This document includes the analysis of simple and complex satellite abundances in different Black/6 and Black/10 mouse strains, and uses the written history (generations at splits along the “phylogeny”) to calculate rates of copy number change.
data <- read.table("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_MA_30x_trimmed_data_kseek.rep.normGC", header=T)
#data[1:5, 1:5]
nrow(data)
## [1] 13749
#colnames(data)
#need to flip the data frame since it's in the opposite orientation
data2 <- data.frame(t(data[-1]))
# match the file names to the names of the lines
#data2[, 1:5]
colnames(data2) <- data[,1]
line_names <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_line_names.csv", header=F)
line_files <- as.vector(line_names[,1])
line_names <- as.vector(line_names[,2])
names(line_names) <- line_files
line_names_df <- as.data.frame(line_names)
data_withnames <- merge (data2, line_names, by=0)
lines <- as.vector(data_withnames[,"y"])
data_withnames[,"y"] <- NULL
data_withnames[,1] <- NULL
rownames (data_withnames) <- lines
#data_withnames[,1:3]
#data2[,1:5]
# simplify the kmer names
kmer.labels <- sapply(strsplit(colnames(data_withnames), "\\/"), '[', 1)
colnames(data_withnames) <- kmer.labels
# now calculate the means and sort the data
kmer_means <- colMeans(data_withnames)
data_sorted <- data_withnames[,order(kmer_means, decreasing=T)]
# first use common kmers, which have an average abundance of at least 100
get_common_kmers <- function (kmer_matrix) {
common.kmer.indexes <- c()
for (i in 1:ncol(kmer_matrix)) {
if (mean(kmer_matrix[,i])>=100) {
common.kmer.indexes <- c(common.kmer.indexes, i)
}
}
names(common.kmer.indexes) <- colnames(kmer_matrix)[common.kmer.indexes]
return (common.kmer.indexes)
}
data_sorted_common <- data_sorted[,get_common_kmers(data_sorted)]
#data_sorted_common[,1:5]
ncol(data_sorted_common)
## [1] 63
#write.csv(data_sorted_common, "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/mouse_common_kmer_abundances.csv")
AG_repeats <- c("AG", "AAG", "AAAG", "AGG", "AAGGAG", "AAGG", "AAAGG", "AAGAG", "AAGGG", "AGAGG", "AAAAAG", "AAAGAG", "AAAGGG", "AAAAAG", "AGAGGG", "AAGAGG", "AAAGGAAG", "AGGG")
data_sorted_common[,AG_repeats]
## AG AAG AAAG AGG AAGGAG AAGG
## BL6/NTac 67256.45 26347.52 7296.452 8533.097 2418.486 2198.468
## BL6/NCrl 69750.00 38055.28 10579.327 8329.694 2560.842 2322.308
## BL6/NHsd 48925.53 16230.10 5543.591 7846.022 1587.977 1526.284
## BL6N-TyrCBrdCrlCrl 70561.23 38934.90 10379.190 9185.140 2827.708 2397.239
## BL6/JBomTac 66683.30 32800.02 9477.913 8461.805 2386.488 2309.149
## BL6/J 60032.36 27832.02 7200.989 9617.130 2744.191 2227.678
## BL10/ScSnJ 64246.86 40959.64 8851.526 10087.380 2986.484 2297.634
## BL10/SnJ 62270.60 42314.92 10284.460 9944.040 2981.819 2302.539
## BL10/ScCr 50443.03 17614.41 5509.765 10019.725 2079.948 1839.200
## BL6/NJ 63688.51 39818.80 9769.525 9866.128 2859.624 2327.099
## BL10/J 66349.53 41738.66 10232.616 8473.754 2656.981 2079.099
## BL6/ByJ 65096.19 36217.82 10223.577 8199.549 2427.937 2149.029
## BL6/JEiJ 62512.38 33133.06 9364.256 8260.159 2339.655 2148.763
## BL10/ScNHsd 63805.98 38974.22 10193.485 7739.205 2386.076 2139.598
## AAAGG AAGAG AAGGG AGAGG AAAAAG
## BL6/NTac 1736.9611 1042.5865 999.5002 535.6924 245.2341
## BL6/NCrl 1881.9835 1211.2068 960.0620 565.0340 219.2628
## BL6/NHsd 875.7659 624.7763 835.4631 449.1689 244.1678
## BL6N-TyrCBrdCrlCrl 1950.5955 1203.5164 1104.6945 629.0574 237.9968
## BL6/JBomTac 1835.7438 1230.6188 944.1000 588.7770 226.2638
## BL6/J 1656.7825 1041.2083 974.7886 573.7315 247.8997
## BL10/ScSnJ 1869.7141 1199.3150 1018.3826 583.8824 229.5712
## BL10/SnJ 1925.2236 1227.2337 983.7501 579.4867 246.9304
## BL10/ScCr 992.8838 685.8580 1007.0211 540.3200 268.6505
## BL6/NJ 1897.0839 1204.7733 1020.5747 644.0331 228.0036
## BL10/J 1762.1477 1211.4311 799.3172 492.8769 271.2413
## BL6/ByJ 1781.3128 1106.8807 881.1411 512.5859 248.5310
## BL6/JEiJ 1737.5669 1089.5450 843.0328 528.1448 258.9078
## BL10/ScNHsd 1792.5522 1162.6397 830.4409 536.9965 216.8835
## AAAGAG AAAGGG AAAAAG.1 AGAGGG AAGAGG AAAGGAAG
## BL6/NTac 263.2982 251.1064 245.2341 223.9919 139.93000 122.36293
## BL6/NCrl 360.0023 253.2182 219.2628 198.2796 154.23436 152.96323
## BL6/NHsd 157.4740 191.8332 244.1678 184.2827 59.50754 60.94818
## BL6N-TyrCBrdCrlCrl 381.7134 297.4409 237.9968 235.0870 170.15005 139.63608
## BL6/JBomTac 359.0054 247.6448 226.2638 221.1700 132.16278 142.68280
## BL6/J 278.0870 294.7840 247.8997 187.6473 128.91459 114.64040
## BL10/ScSnJ 369.5459 297.4225 229.5712 191.3634 136.97390 135.56400
## BL10/SnJ 417.5889 312.8885 246.9304 175.6377 138.50373 138.72122
## BL10/ScCr 164.5873 247.0828 268.6505 206.0989 71.88263 60.08225
## BL6/NJ 417.2335 305.4141 228.0036 210.1792 151.32141 136.65068
## BL10/J 399.0728 264.1721 271.2413 208.6264 127.90973 144.12123
## BL6/ByJ 354.3842 240.8161 248.5310 195.0101 132.82618 143.98770
## BL6/JEiJ 320.7037 235.4233 258.9078 201.1888 118.99387 129.50019
## BL10/ScNHsd 338.5952 212.5425 216.8835 216.0247 124.51339 147.25015
## AGGG
## BL6/NTac 142.7499
## BL6/NCrl 123.2662
## BL6/NHsd 119.5792
## BL6N-TyrCBrdCrlCrl 146.3483
## BL6/JBomTac 133.7977
## BL6/J 122.2764
## BL10/ScSnJ 135.3885
## BL10/SnJ 125.6165
## BL10/ScCr 151.9664
## BL6/NJ 133.4303
## BL10/J 117.8604
## BL6/ByJ 110.3731
## BL6/JEiJ 132.9163
## BL10/ScNHsd 117.4796
sum(kmer_means[AG_repeats])
## [1] 125050.6
AC_repeats <- c("AC", "ACC", "AACC", "AAAAAC", "AAAACC", "AAAAACC", "AAACC")
sum(kmer_means[AC_repeats])
## [1] 192392.3
which(colnames(data_sorted_common)=="AGGG")
## [1] 51
# now make a PCA using these 63 kmers
pca.data <- prcomp(data_sorted_common, scale.=T)
library(ggbiplot)
## Loading required package: ggplot2
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2021a.
## 1.0/zoneinfo/America/New_York'
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8,choices=c(1,3), labels=rownames(data_sorted_common))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
# do a PCA using more data - see if there is separation.
get_legit_kmer_indexes <- function (kmer_mx) {
indexes <- c()
for (i in 1:ncol(kmer_mx)) { ## go through each column of the data
p <- kmer_mx[,i] >= 10 ## at least 10 copies
if (sum(p) >= 1) { # if one sample have at least 2 copies
indexes <- c(indexes, i)
}
}
return(indexes)
}
legit_indexes <- get_legit_kmer_indexes(data_sorted)
length(legit_indexes)
## [1] 434
# 434 kmers have at least 10 copies in at least one sample - use these now for the PCA.
legit_data <- data_sorted[,legit_indexes]
legit_kmer_names <- colnames(legit_data)
# is the kmer at high density present?
#legit_data[,"ACACACAGTGGAGCACTGTG"] # no, not present. Too high diversity
#data_sorted[,"ACACACAGTGGAGCACTGTG"]
legit_data[,"AAGCCAGCTGTGGGTAAGG"] # only low abundance. 17-25 copies
## [1] 18.01512 18.78037 18.43254 19.54553 19.79372 20.68253 23.65253
## [8] 23.73084 25.80129 25.45674 19.98419 19.78335 21.77381 17.85810
legit_data[,"AAAAATCTTAAAGG"]
## [1] 107.13230 110.76308 167.61558 102.14928 156.76916 120.55382 56.75502
## [8] 109.61623 233.95632 81.24444 49.65902 38.63920 67.85405 303.27720
mean(legit_data[,"AAAAATCTTAAAGG"]) # highly variable, in the top 63. (range 233-38) missing from the assembly
## [1] 121.856
#write.csv(legit_data, "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/legit_data.csv")
#write.table(kb5_kmers_notkseek, file = "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/kb5_kmers_notkseek", row.names = F, col.names=F, quote=F)
#write.table(kb5_kmers_notkseek, file = "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/kb5_kmers_yeskseek", row.names = F, col.names=F, quote=F)
#write.table(legit_kmer_names, file = "~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/legit_kmer_names.txt", row.names = F, col.names=F, quote=F)
# change the name
# colour them differently
rownames(legit_data)[4] <- "BL6/NTyrCBrdCrlCrl"
strain <- sapply(strsplit(as.vector(rownames(legit_data)), "\\/"), '[', 1)
pca.data <- prcomp(legit_data, scale.=T)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
# here's some separation
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = strain, var.axes = F, alpha=0.8, choices=c(1,3))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, var.axes = F, alpha=0.8, choices=c(1,3), labels=rownames(legit_data), labels.size=2)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
get_abskmer_mx <- function (kmer_mx, indexes) {
kmers.abs <- matrix(nrow=nrow(kmer_mx), ncol=max(indexes))
for (i in indexes) {
for (j in 1:nrow(kmer_mx)) {
kmers.abs[j,i] <- kmer_mx[j,i] * (nchar(colnames(kmer_mx)[i], type="chars"))
}
}
kmers.abs <- kmers.abs[,colSums(is.na(kmers.abs)) != nrow(kmers.abs)]# get rid of columns that are all NAs
rownames(kmers.abs) <- rownames(kmer_mx)
colnames(kmers.abs) <- colnames(kmer_mx)
return(kmers.abs)
}
# get total kmer content per MA line
get_total_kmercontent <- function (abs_mx) {
total.kmers.abs <- c()
for (i in 1:nrow(abs_mx)){
total.kmers.abs[i] <- sum(abs_mx[i,])/10^6
}
names(total.kmers.abs) <- rownames(abs_mx)
return (total.kmers.abs)
}
data_abs <- get_abskmer_mx(legit_data, c(1:ncol(legit_data)))
total_kmer_content <- get_total_kmercontent(data_abs)
total_kmer_content # note, this is in MB
## BL6/NTac BL6/NCrl BL6/NHsd
## 1.222917 1.288536 1.073765
## BL6/NTyrCBrdCrlCrl BL6/JBomTac BL6/J
## 1.393987 1.270839 1.388050
## BL10/ScSnJ BL10/SnJ BL10/ScCr
## 1.472052 1.469211 1.179894
## BL6/NJ BL10/J BL6/ByJ
## 1.469178 1.274572 1.285923
## BL6/JEiJ BL10/ScNHsd
## 1.215779 1.250947
boxplot(total_kmer_content, las=2)
library(phytools)
## Loading required package: ape
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
mouse_tree <- read.tree("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/manual_ext.tre")
plot(mouse_tree)
# need to rename so they match the tree
new_rownames <- gsub("/", "", names(total_kmer_content))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2
names(total_kmer_content) <- new_rownames2
#total_kmer_content
obj<-contMap(mouse_tree,total_kmer_content,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
Calc_kmer_length <- function (kmer_string) {
return (nchar(kmer_string))
}
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.3.3
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plyr':
##
## rename
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.3.3
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:plyr':
##
## desc
## Loading required package: XVector
## Warning: package 'XVector' was built under R version 3.3.3
##
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
##
## compact
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
Calc_GC_content <- function (kmer_string) {
string <- BString(kmer_string)
gc.content <- c(letterFrequency(string, letters=c("CG"), as.prob=F)/(length(string)))
return(gc.content)
}
common.kmers <- colnames(data_sorted_common)
common.kmer.lengths <- sapply(common.kmers, Calc_kmer_length)
names(common.kmer.lengths) <- common.kmers
#common.kmer.lengths[1:10]
common.kmer.gc <- sapply(common.kmers, Calc_GC_content)
names(common.kmer.gc) <- common.kmers
#common.kmer.gc[1:10]
hist(common.kmer.lengths, breaks=20)
summary(common.kmer.lengths) # median = 6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 6.000 6.524 6.000 19.000
length(which(common.kmer.lengths==4)) # 11
## [1] 11
length(which(common.kmer.lengths==5)) #8
## [1] 8
length(which(common.kmer.lengths==6)) # 22
## [1] 22
length(common.kmer.lengths)
## [1] 63
hist(common.kmer.gc, breaks=10)
summary(common.kmer.gc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3333 0.5000 0.4456 0.5895 0.7500
# look at legit kmers now
legit.kmers <- colnames(legit_data)
legit.kmer.lengths <- sapply(legit.kmers, Calc_kmer_length)
names(legit.kmer.lengths) <- legit.kmers
legit.kmer.gc <- sapply(legit.kmers, Calc_GC_content)
names(legit.kmer.gc) <- legit.kmers
hist(legit.kmer.lengths, breaks=20)
hist(legit.kmer.gc, breaks=10)
# note: telomere satellite is common.kmers[4]:
common.kmers[4]
## [1] "AACCCT"
total_kmer_content
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 1.222917 1.288536 1.073765 1.393987
## B6JBomTac B6J B10ScSnJ B10SnJ
## 1.270839 1.388050 1.472052 1.469211
## B10ScCr B6NJ B10J B6ByJ
## 1.179894 1.469178 1.274572 1.285923
## B6JEiJ B10ScNHsd
## 1.215779 1.250947
abun_data <- c()
for (i in 1:ncol(data_sorted_common)) {
abun_data <- c(abun_data, data_sorted_common[,i])
}
kmer_names <- c()
for (i in 1:ncol(data_sorted_common)) {
kmer_names <- c(kmer_names, c(rep(colnames(data_sorted_common)[i], times = nrow(data_sorted_common))))
}
rownames(data_sorted_common)[4] <- "BL6/NTyrCBrdCrlCrl"
abun_df <- data.frame(rownames(data_sorted_common), kmer_names, abun_data)
abun_df[1:15, ]
## rownames.data_sorted_common. kmer_names abun_data
## 1 BL6/NTac AC 185511.68
## 2 BL6/NCrl AC 182274.28
## 3 BL6/NHsd AC 182867.67
## 4 BL6/NTyrCBrdCrlCrl AC 192672.86
## 5 BL6/JBomTac AC 188531.48
## 6 BL6/J AC 202091.49
## 7 BL10/ScSnJ AC 202385.62
## 8 BL10/SnJ AC 199890.74
## 9 BL10/ScCr AC 204310.46
## 10 BL6/NJ AC 197374.94
## 11 BL10/J AC 175373.72
## 12 BL6/ByJ AC 178864.78
## 13 BL6/JEiJ AC 180017.66
## 14 BL10/ScNHsd AC 182467.11
## 15 BL6/NTac AG 67256.45
colnames(abun_df) <- c("Sample", "kmer", "Copy")
strain <- sapply(strsplit(as.vector(abun_df$Sample), "\\/"), '[', 1)
abun_df <- data.frame(abun_df, strain)
#abun_df
library(ggplot2)
abun_df$kmer <- factor(abun_df$kmer, levels=colnames(data_sorted_common))
# this plot needs some work
ggplot(data = abun_df[c(1:70),], aes(x = kmer, y = Copy)) +
geom_boxplot() +
geom_jitter(aes(color=strain)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
ggplot(data = abun_df[c(71:140),], aes(x = kmer, y = Copy)) +
geom_boxplot() +
geom_jitter(aes(color=strain)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
# note, this part is needed to make the table I import in:
mouse_tree$tip.label
## [1] "B10SnJ" "B10ScSnJ" "B10ScCr"
## [4] "B10ScNHsd" "B10J" "B6ByJ"
## [7] "B6NCrl" "B6NHsd" "B6NTac"
## [10] "B6NTyrCBrdCrlCrl" "B6NJ" "B6JEiJ"
## [13] "B6JBomTac" "B6J"
mouse_tree$Nnode # 13 nodes now
## [1] 13
findMRCA(mouse_tree, tips=mouse_tree$tip.label, type="node")
## [1] 15
findMRCA(mouse_tree, tips=c("B6J", "B6ByJ"), type="node")
## [1] 20
findMRCA(mouse_tree, tips=c("B6J", "B6JEiJ"), type="node")
## [1] 26
findMRCA(mouse_tree, tips=c("B6J", "B6JBomTac"), type="node")
## [1] 27
findMRCA(mouse_tree, tips=c("B6NJ", "B6ByJ"), type="node")
## [1] 21
findMRCA(mouse_tree, tips=c("B6NJ", "B6NCrl"), type="node")
## [1] 22
findMRCA(mouse_tree, tips=c("B6NJ", "B6NHsd"), type="node")
## [1] 23
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTac"), type="node")
## [1] 24
findMRCA(mouse_tree, tips=c("B6NJ", "B6NTyrCBrdCrlCrl"), type="node")
## [1] 25
findMRCA(mouse_tree, tips=c("B10J", "B10SnJ"), type="node")
## [1] 16
findMRCA(mouse_tree, tips=c("B10J", "B10ScSnJ"), type="node")
## [1] 17
findMRCA(mouse_tree, tips=c("B10J", "B10ScCr"), type="node")
## [1] 18
findMRCA(mouse_tree, tips=c("B10J", "B10ScNHsd"), type="node")
## [1] 19
# here is the node info file:
node_info <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/node_info_Mar17-20.csv", header = T)
# here is a function to calculate mutation rates for our tree, given an abundance matrix
calc_mutation_rates <- function (df_abun) {
mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
for (i in 1:ncol(df_abun)) {
kmer_subset <- df_abun[,i]
names(kmer_subset) <- rownames(df_abun)
anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
for (j in 1:nrow(df_abun)) {
subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
mu_mx[j,i] <- (df_abun[j,i] - anc_states$ace[subset_table$nearest_node]) / subset_table$gens
}
}
rownames(mu_mx) <- rownames(df_abun)
colnames(mu_mx) <- colnames(df_abun)
return(mu_mx)
}
# check for total kmer content
anc_states <- fastAnc(mouse_tree, total_kmer_content, vars=T, CI=T)
#anc_states
1.073765 - 1.264150 # lost by B6NHsd since common ancestor with below
## [1] -0.190385
1.469178 - 1.264150 # gained by B6NJ since common ancestor with below
## [1] 0.205028
# here is a function that calculates normalized-by-copy-number mutation rates, given the original abundance matrix and the non-normalized mutation matrix
calc_norm_mu <- function (df_abun, mu_mx) {
mu_mx_norm <- matrix(nrow=nrow(mu_mx), ncol=ncol(mu_mx))
for (i in 1:ncol(mu_mx)) {
kmer_subset <- df_abun[,i]
names(kmer_subset) <- rownames(df_abun)
anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T) # get the ancestral copy number
for (j in 1:nrow(mu_mx)) {
subset_table <- node_info[which(node_info$Sample==rownames(mu_mx)[j]),]
mu_mx_norm[j,i] <- mu_mx[j,i]/anc_states$ace[subset_table$nearest_node]
}
}
rownames(mu_mx_norm) <- rownames(mu_mx)
colnames(mu_mx_norm) <- colnames(mu_mx)
return(mu_mx_norm)
}
# this is a function that calculates absolute mutation rates (taking the absolute value of additions and subtractions).
calc_mutation_rates_abs <- function (df_abun) {
mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
for (i in 1:ncol(df_abun)) {
kmer_subset <- df_abun[,i]
names(kmer_subset) <- rownames(df_abun)
anc_states <- fastAnc(mouse_tree, kmer_subset, vars=T, CI=T)
for (j in 1:nrow(df_abun)) {
subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
}
}
rownames(mu_mx) <- rownames(df_abun)
colnames(mu_mx) <- colnames(df_abun)
return(mu_mx)
}
# fix the rownames so they match
new_rownames <- gsub("/", "", rownames(data_sorted_common))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2
rownames(data_sorted_common) <- new_rownames2
mu_common <- calc_mutation_rates(data_sorted_common)
per_kmer_mu<- colMeans(mu_common)
length(per_kmer_mu)
## [1] 63
kmer_means_sorted <- sort (kmer_means, decreasing = T)
# check for correlation of abundance with copy number.
plot(x = kmer_means_sorted[1:63], y = per_kmer_mu )
plot (x = kmer_means_sorted[7:63], y = per_kmer_mu[7:63])
# generally a positive correlation
# for each line, do a sign test to determine if kmers have randomly and independently gone up and down - would expect half would be net increase and half would be net decrease
binom_results <- c()
for (i in 1:nrow(mu_common)) {
num_neg <- length(which(mu_common[i, ] < 0))
total <- ncol(mu_common)
p <- binom.test(num_neg, total)
binom_results <- c(binom_results, p$p.value)
}
binom_results
## [1] 8.980472e-04 1.000000e+00 5.152397e-03 4.295655e-02 3.135031e-01
## [6] 5.152397e-03 2.227532e-03 3.760612e-05 8.013065e-01 3.367093e-04
## [11] 1.000000e+00 1.000000e+00 8.980472e-04 4.295655e-02
rownames(mu_common)[which(binom_results < 0.01)]
## [1] "B6NTac" "B6NHsd" "B6J" "B10ScSnJ" "B10SnJ" "B6NJ"
## [7] "B6JEiJ"
# many lines have more increases than decreases or vise versa
### now do for each line - plot the common kmers abundance and mutation rate
#install.packages("ggallin")
library("ggallin")
rownames(data_sorted_common[1,])
## [1] "B6NTac"
a <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[1,]), y = as.numeric(mu_common[1,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NTac") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[2,])
## [1] "B6NCrl"
b <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[2,]), y = as.numeric(mu_common[2,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NCrl") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[3,])
## [1] "B6NHsd"
c <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[3,]), y = as.numeric(mu_common[3,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NHsd") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[4,])
## [1] "B6NTyrCBrdCrlCrl"
d <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[4,]), y = as.numeric(mu_common[4,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NTyrCBrdCrlCrl") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[5,])
## [1] "B6JBomTac"
e <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[5,]), y = as.numeric(mu_common[5,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6JBomTac") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[6,])
## [1] "B6J"
f <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[6,]), y = as.numeric(mu_common[6,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6J") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[7,])
## [1] "B10ScSnJ"
g <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[7,]), y = as.numeric(mu_common[7,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10ScSnJ") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[8,])
## [1] "B10SnJ"
h <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[8,]), y = as.numeric(mu_common[8,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10SnJ") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[9,])
## [1] "B10ScCr"
i <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[9,]), y = as.numeric(mu_common[9,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10ScCr") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[10,])
## [1] "B6NJ"
j <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[10,]), y = as.numeric(mu_common[10,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6NJ") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[11,])
## [1] "B10J"
k <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[11,]), y = as.numeric(mu_common[11,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10J") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[12,])
## [1] "B6ByJ"
l <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[12,]), y = as.numeric(mu_common[12,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6ByJ") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[13,])
## [1] "B6JEiJ"
m <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[13,]), y = as.numeric(mu_common[13,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/6JEiJ") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
rownames(data_sorted_common[14,])
## [1] "B10ScNHsd"
n <- ggplot () +
geom_point(aes (x = as.numeric(data_sorted_common[14,]), y = as.numeric(mu_common[14,]), alpha = 0.5)) +
labs (x="kmer abundance (copies)", y="(copies/generation)", title = "C57BL/10ScNHsd") +
scale_y_continuous(trans = pseudolog10_trans) +
scale_x_log10() +
theme(legend.position = "none")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
##
## combine
grid.arrange(b,g,j,c,a,d,e,f,h,i,k,l,m,n, ncol=2)
#normalize by copy number
mu_common_norm <- calc_norm_mu(data_sorted_common, mu_common)
# per kmer average mutation rate
per_kmer_mu_norm_mean <- colMeans(mu_common_norm)
par(mar = c(2, 2, 2, 2))
plot(per_kmer_mu_norm_mean)
# one kmer has the most negative mutatation rate
which(per_kmer_mu_norm_mean==min(per_kmer_mu_norm_mean)) # normalized by copy number, most contracting
## AAAGACAGACAG
## 27
# this is the one with phylogenetic signal
# verified this one manually to make sure there was no mistake.
highest_kmer <- data_sorted_common[,27]
names(highest_kmer) <- rownames(data_sorted_common)
# look at amounts at each node
anc_states <- fastAnc(mouse_tree, highest_kmer, vars=T, CI=T)
anc_states$ace
## 15 16 17 18 19 20 21
## 420.27294 72.76111 57.51539 43.91212 21.99731 767.78476 788.11806
## 22 23 24 25 26 27
## 805.51032 851.40148 864.22009 872.23266 824.66497 846.29339
obj<-contMap(mouse_tree,highest_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
par(mar = c(2, 2, 2, 2))
per_line_mu_norm_mean <- rowMeans(mu_common_norm)
plot(per_line_mu_norm_mean) # line 10 is most increasing, line 3 is most decreasing
per_line_mu_norm_mean
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## -5.108446e-04 1.018382e-04 -9.047817e-04 2.279445e-04
## B6JBomTac B6J B10ScSnJ B10SnJ
## -2.278171e-04 4.288457e-04 1.220611e-04 1.862966e-04
## B10ScCr B6NJ B10J B6ByJ
## -3.973537e-04 8.150201e-04 2.706781e-05 -3.471784e-05
## B6JEiJ B10ScNHsd
## -1.783642e-04 -3.200731e-04
# look at the telomere satellite
telomere_kmer <- data_sorted_common[,4]
names(telomere_kmer) <- rownames(data_sorted_common)
telomere_kmer
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 30146.60 24934.31 26172.35 31314.12
## B6JBomTac B6J B10ScSnJ B10SnJ
## 25013.94 45939.98 46029.90 40211.11
## B10ScCr B6NJ B10J B6ByJ
## 35171.94 39748.29 22781.08 30818.52
## B6JEiJ B10ScNHsd
## 24649.66 26089.82
# look at amounts at each node
anc_states <- fastAnc(mouse_tree, telomere_kmer, vars=T, CI=T)
anc_states$ace
## 15 16 17 18 19 20 21 22
## 33122.94 35614.98 35362.01 33648.22 28314.51 30630.91 30000.19 29604.91
## 23 24 25 26 27
## 30121.95 31568.94 33730.17 30980.68 33096.57
obj<-contMap(mouse_tree,telomere_kmer,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# range 22781 - 46030 - factor of 2.
# look at CAG repeats (polyQ)
which(names((data_sorted_common))=="AGC")
## [1] 25
poly_Q <- data_sorted_common[,25]
names(poly_Q) <- rownames(data_sorted_common)
anc_states <- fastAnc(mouse_tree, poly_Q, vars=T, CI=T)
anc_states$ace
## 15 16 17 18 19 20 21 22
## 587.9395 588.3390 589.6813 588.8991 584.2025 587.5399 584.7799 584.6911
## 23 24 25 26 27
## 585.0344 585.1220 585.3344 591.7505 587.1016
obj<-contMap(mouse_tree,poly_Q,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
poly_Q # very tight: 572-613 copies
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 584.8186 582.6070 585.9047 588.1745
## B6JBomTac B6J B10ScSnJ B10SnJ
## 572.0243 591.7187 609.6128 573.4039
## B10ScCr B6NJ B10J B6ByJ
## 595.1478 582.8482 577.5038 577.1086
## B6JEiJ B10ScNHsd
## 613.4282 584.0698
## Let's make a box plot of mutation rates ###
head(mu_common_norm)
## AC AG AAG AACCCT
## B6NTac -1.665755e-04 0.0004075430 -0.0016936235 -0.0004374261
## B6NCrl -1.037055e-04 0.0005307961 0.0010510781 -0.0008913244
## B6NHsd -1.520792e-04 -0.0015642071 -0.0033459144 -0.0009501471
## B6NTyrCBrdCrlCrl 4.070876e-05 0.0010917158 0.0012808578 -0.0011192028
## B6JBomTac -1.399663e-04 0.0004652882 0.0004670982 -0.0021052905
## B6J 4.700042e-04 -0.0004409409 -0.0009093677 0.0033453313
## AGAT AGAGGC AAAG AGG
## B6NTac -0.0036641401 -3.605975e-04 -0.0015142443 -3.295951e-04
## B6NCrl 0.0015920712 -3.000287e-04 0.0010430853 -1.410237e-04
## B6NHsd -0.0043442036 2.057965e-06 -0.0024647120 -6.330540e-04
## B6NTyrCBrdCrlCrl 0.0009625905 2.326053e-04 0.0015827283 -4.349942e-05
## B6JBomTac 0.0017003030 -2.519822e-05 0.0008803666 -3.848845e-04
## B6J -0.0017989410 2.586653e-04 -0.0014021171 7.395840e-04
## ACAGC AAGGAG AAGG ACC
## B6NTac -6.017285e-05 -0.0002962075 7.938251e-05 -0.0006369315
## B6NCrl -6.596555e-04 0.0003701609 4.853483e-04 -0.0001221603
## B6NHsd 3.251943e-04 -0.0023512420 -1.946080e-03 -0.0002215841
## B6NTyrCBrdCrlCrl 1.249040e-03 0.0008304876 8.059319e-04 -0.0005678451
## B6JBomTac -2.921477e-04 -0.0004308801 2.956358e-04 -0.0005847944
## B6J 8.914476e-04 0.0007966639 -1.894608e-05 0.0008934752
## ATCC AAAGG ACAG
## B6NTac -4.760371e-04 0.0001649562 -9.431094e-05
## B6NCrl 2.326906e-05 0.0007292390 -8.334183e-05
## B6NHsd -9.738023e-04 -0.0032640963 -1.418911e-03
## B6NTyrCBrdCrlCrl -2.380564e-04 0.0010688596 2.446654e-03
## B6JBomTac -2.671971e-04 0.0004943666 3.432759e-04
## B6J 5.718671e-04 -0.0003942336 6.148712e-05
## AAGTCTGCACACACATACT ACCT ACACAG
## B6NTac -0.0003693063 -0.0005101438 -0.0002956678
## B6NCrl -0.0007367637 0.0006628411 -0.0001270817
## B6NHsd 0.0003864204 -0.0042674535 -0.0001753629
## B6NTyrCBrdCrlCrl -0.0006380546 0.0026445685 0.0004140957
## B6JBomTac -0.0012574995 0.0006555566 -0.0003819965
## B6J 0.0017008338 -0.0005348536 0.0006864821
## AT AAGAG AAGGG AGAGAGGC
## B6NTac -0.002822038 -0.0002509086 8.403835e-05 -4.090651e-04
## B6NCrl 0.001638562 0.0007967816 9.550605e-05 -9.735448e-05
## B6NHsd -0.004908572 -0.0027839155 -8.956826e-04 -9.252141e-05
## B6NTyrCBrdCrlCrl -0.001023197 0.0008232176 1.132484e-03 8.045671e-04
## B6JBomTac 0.001204549 0.0008527776 5.331611e-05 -2.558212e-05
## B6J -0.000407570 -0.0006053300 3.352709e-04 1.654990e-04
## ACATAT ACAGACAGCACAGCACAGC AGC
## B6NTac -8.973783e-04 -0.0001011831 -5.035019e-06
## B6NCrl 3.277600e-04 -0.0006282529 -2.013794e-05
## B6NHsd -8.395672e-04 0.0005911077 1.077951e-05
## B6NTyrCBrdCrlCrl 6.342445e-04 0.0012820218 7.581416e-05
## B6JBomTac -7.849088e-05 -0.0004538098 -2.213871e-04
## B6J 2.805531e-04 0.0008161647 6.779565e-05
## AGAGG AAAGACAGACAG AAAAACAAGGGAGATAT
## B6NTac -5.451441e-04 0.0001919794 -0.0006024016
## B6NCrl 1.976184e-04 -0.0007518187 -0.0007702676
## B6NHsd -1.271293e-03 0.0009523442 0.0027391458
## B6NTyrCBrdCrlCrl 6.162522e-04 -0.0005997973 -0.0013208665
## B6JBomTac 3.186189e-04 -0.0010862144 -0.0010467956
## B6J 9.018506e-05 0.0015819253 0.0009937361
## AGGC ACAGACAGCACAGC AACC AAAAG
## B6NTac -0.0020117145 4.751776e-05 -6.836915e-04 0.0001320768
## B6NCrl -0.0016632974 -7.066192e-04 2.378089e-04 0.0001860139
## B6NHsd 0.0004286222 6.277708e-04 7.886844e-05 0.0001341426
## B6NTyrCBrdCrlCrl -0.0024026780 1.163767e-03 3.846697e-04 0.0003208179
## B6JBomTac -0.0044165306 -4.766036e-04 2.781311e-04 0.0003026365
## B6J 0.0048059615 9.447221e-04 -3.292601e-04 -0.0004063741
## AAAGAG AGATAT AGCAGG ACAGAT
## B6NTac -0.0016948721 -2.063519e-03 -8.270165e-04 -5.103155e-04
## B6NCrl 0.0008036940 1.502680e-03 -4.141238e-04 2.764000e-04
## B6NHsd -0.0034238043 -1.268949e-03 5.412703e-05 -2.417208e-03
## B6NTyrCBrdCrlCrl 0.0008106058 -2.490464e-04 -5.730695e-04 9.071935e-05
## B6JBomTac 0.0010697903 -2.440358e-04 -9.435134e-04 2.287869e-04
## B6J -0.0011144041 6.836742e-05 1.548868e-03 1.104506e-04
## AAAGGG AACAAG AAAAAG AGGAGGC
## B6NTac -4.677551e-04 2.546155e-05 0.0002801716 -9.187078e-04
## B6NCrl 6.409462e-05 8.371351e-04 -0.0004327796 -5.548012e-04
## B6NHsd -1.681945e-03 -2.853162e-03 0.0001672457 1.284541e-05
## B6NTyrCBrdCrlCrl 7.203940e-04 2.164467e-04 0.0001700484 -1.205617e-03
## B6JBomTac -5.027060e-04 -6.728850e-04 -0.0005210630 -5.620871e-04
## B6J 1.042552e-03 8.818891e-07 0.0002534426 1.029203e-03
## AAGC AGAGGG AGAGAGGCAGAGGC AAAGC
## B6NTac -5.434884e-04 0.0004652802 -3.312943e-04 3.483304e-04
## B6NCrl 7.403019e-04 -0.0001436764 8.352441e-05 7.155242e-05
## B6NHsd -3.233108e-04 -0.0007647574 -1.441311e-04 8.826565e-05
## B6NTyrCBrdCrlCrl 2.236890e-04 0.0011788727 4.182530e-04 5.895693e-04
## B6JBomTac -6.453794e-05 0.0007462053 3.510603e-05 7.007548e-04
## B6J 2.297196e-04 -0.0006735322 2.077488e-04 -6.941516e-04
## ACAGGC AAACAAGGGAGATAT ACAGAG ACAT
## B6NTac -0.0005248930 -0.0008268581 0.0008632881 -0.0010080044
## B6NCrl -0.0008088567 -0.0003203606 -0.0001639874 -0.0001848299
## B6NHsd -0.0002515918 0.0009086674 -0.0005174375 -0.0003440389
## B6NTyrCBrdCrlCrl -0.0003289970 -0.0006655922 -0.0005338290 0.0014361438
## B6JBomTac -0.0005080720 -0.0006975355 -0.0002898178 -0.0005412730
## B6J 0.0012970047 0.0008500562 0.0006739503 0.0003718677
## ACT AGAGAT AGGG AAAAAC
## B6NTac -3.081124e-04 -0.0029479031 0.0005669797 0.0002932379
## B6NCrl -7.990339e-05 0.0013842437 -0.0001663411 -0.0012957452
## B6NHsd -3.130408e-03 -0.0038984548 -0.0005714235 0.0046463337
## B6NTyrCBrdCrlCrl -8.992007e-04 -0.0007953841 0.0009922715 -0.0014301993
## B6JBomTac 3.735892e-04 0.0001349109 0.0003589676 -0.0077436549
## B6J 2.365849e-04 -0.0005866912 -0.0004142656 0.0056165427
## AAGAGG AAAGGAAG AAAAATCTTAAAGG ACTGCT
## B6NTac 0.0001734283 -0.0001211827 6.059315e-05 -0.0005373543
## B6NCrl 0.0010380652 0.0011953256 2.555483e-04 -0.0008290947
## B6NHsd -0.0038070492 -0.0034883860 3.463674e-03 -0.0004780371
## B6NTyrCBrdCrlCrl 0.0020798018 0.0009448833 5.935478e-04 -0.0011862780
## B6JBomTac 0.0002366203 0.0009352782 2.402745e-03 -0.0007848794
## B6J 0.0000189325 -0.0009428198 -1.437855e-04 0.0004148049
## ACACAT AAACC AAAACC AAAAACC
## B6NTac -0.0003328363 4.595988e-05 -0.0008159407 0.0002409840
## B6NCrl 0.0003042913 1.862295e-04 -0.0006498141 -0.0003272137
## B6NHsd -0.0010053734 -3.836788e-04 0.0013755596 0.0021085351
## B6NTyrCBrdCrlCrl -0.0017003803 -3.084245e-04 -0.0009651283 -0.0009297406
## B6JBomTac -0.0006450958 2.966610e-04 -0.0007742960 -0.0015244409
## B6J 0.0012565483 -2.042011e-04 0.0012831101 0.0017776114
## ACACATAT AGCAGGAGGAGG ACATAG
## B6NTac -0.0016465201 -4.637637e-06 -9.896685e-04
## B6NCrl 0.0014689497 -5.021070e-04 9.122955e-04
## B6NHsd -0.0040666019 -2.520583e-05 -2.393602e-03
## B6NTyrCBrdCrlCrl 0.0010072023 3.323291e-04 3.911062e-04
## B6JBomTac 0.0009190992 -3.435794e-04 -6.417093e-05
## B6J -0.0009905018 4.401858e-04 8.034789e-04
# need to get it in format to make a box plot. Want to use mu_common_norm
mutation_rate_data <- c()
kmer_names <- c()
for (i in 1:nrow(mu_common_norm)) {
mutation_rate_data <- c(mutation_rate_data, as.numeric(mu_common_norm[i,]) )
kmer_names <- c(kmer_names, colnames(mu_common_norm))
}
mutation_rate_df <- data.frame(mutation_rate_data, kmer_names)
head(mutation_rate_df)
## mutation_rate_data kmer_names
## 1 -0.0001665755 AC
## 2 0.0004075430 AG
## 3 -0.0016936235 AAG
## 4 -0.0004374261 AACCCT
## 5 -0.0036641401 AGAT
## 6 -0.0003605975 AGAGGC
# a plot for the hypermutators - part I (later add in complex)
# add in all the 14 samples
subset_hypermutators <- mu_common_norm[which(rownames(mu_common_norm)=="B6NHsd" | rownames(mu_common_norm)=="B6NJ"),]
# make this into a dataframe that can be a boxplot.
abun_hypermut <- c()
for (j in 1:ncol(mu_common_norm)) {
for (i in 1:nrow(mu_common_norm)) {
abun_hypermut <- c(abun_hypermut, mu_common_norm[i,j])
}
}
line_names <- c(rep(rownames(mu_common_norm), times = ncol(mu_common_norm) ))
kmer_names <- c()
for (i in 1:ncol(mu_common_norm)) {
kmer_names <- c(kmer_names, c(rep(colnames(mu_common_norm)[i], times = nrow(mu_common_norm))))
}
#now make a dataframe
hypermutator_df <- data.frame(line_names, kmer_names, abun_hypermut)
#abun_df
ggplot(data = hypermutator_df, aes(x = line_names, y = abun_hypermut)) +
geom_boxplot() +
geom_jitter(aes(color=line_names)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
# now absolute mutation rates
mu_common_abs <- calc_mutation_rates_abs(data_sorted_common)
mu_common_abs_norm <- calc_norm_mu(data_sorted_common, mu_common_abs)
per_kmer_mu_abs_norm_mean <- colMeans(mu_common_abs_norm)
sort(per_kmer_mu_abs_norm_mean, decreasing=T)[1:10]
## AAAAATCTTAAAGG AAAAAC AAAGACAGACAG AGAT AGGC
## 0.002199034 0.002024336 0.001949065 0.001690533 0.001666607
## AT AGAGAT ACACATAT AACCCT AAAGAG
## 0.001665801 0.001490404 0.001242575 0.001194939 0.001164543
per_line_mu_abs_norm_mean <- rowMeans(mu_common_abs_norm)
plot(per_kmer_mu_abs_norm_mean) # huh, when we take the absolute value of changes, it doesn't stand out. Only when we take the directional value
which(per_kmer_mu_abs_norm_mean==max(per_kmer_mu_abs_norm_mean))
## AAAAATCTTAAAGG
## 55
range(per_kmer_mu_abs_norm_mean)
## [1] 7.796324e-05 2.199034e-03
sort (per_kmer_mu_abs_norm_mean)
## AGC AGAGAGGCAGAGGC AC
## 7.796324e-05 1.682046e-04 2.065710e-04
## AGAGAGGC AACC AGAGGC
## 2.146289e-04 2.208613e-04 2.348638e-04
## AAAAG AGCAGGAGGAGG ACACAG
## 2.540561e-04 2.579746e-04 2.894117e-04
## AAGC AAACC AAAGC
## 3.236761e-04 3.358232e-04 3.458564e-04
## ACATAT AAAAAG AAGGG
## 3.556181e-04 3.626719e-04 3.646905e-04
## AAGG AGG ACAGAG
## 3.823177e-04 3.945726e-04 4.020140e-04
## AGAGG AGAGGG AGGG
## 4.021618e-04 4.082530e-04 4.225192e-04
## ATCC ACC ACAT
## 4.298685e-04 4.387321e-04 4.458417e-04
## AG AAACAAGGGAGATAT ACAG
## 4.701614e-04 5.491447e-04 5.680881e-04
## ACAGGC AACAAG AAGGAG
## 6.252987e-04 6.364125e-04 6.367376e-04
## AAAGGG AAGTCTGCACACACATACT ACAGAT
## 6.376478e-04 6.483114e-04 6.495259e-04
## ACACAT ACTGCT AGGAGGC
## 6.591535e-04 6.928947e-04 6.999353e-04
## AAGAG AGCAGG AAAGG
## 7.261712e-04 7.288115e-04 7.304379e-04
## ACATAG AAGAGG AAAACC
## 7.437793e-04 7.910306e-04 7.939359e-04
## AAAAACAAGGGAGATAT AGATAT AAAGGAAG
## 8.620851e-04 8.624179e-04 9.217655e-04
## ACT AAAG AAAAACC
## 9.223615e-04 9.625643e-04 9.724336e-04
## ACAGC ACCT AAG
## 9.962429e-04 1.057811e-03 1.098304e-03
## ACAGACAGCACAGC ACAGACAGCACAGCACAGC AAAGAG
## 1.129597e-03 1.139886e-03 1.164543e-03
## AACCCT ACACATAT AGAGAT
## 1.194939e-03 1.242575e-03 1.490404e-03
## AT AGGC AGAT
## 1.665801e-03 1.666607e-03 1.690533e-03
## AAAGACAGACAG AAAAAC AAAAATCTTAAAGG
## 1.949065e-03 2.024336e-03 2.199034e-03
ordered_by_mu <- names(sort (per_kmer_mu_abs_norm_mean))
# let's sort the kmers in order of increasing mutation rate, then change the order in
mutation_rate_df$kmer_names <- factor(mutation_rate_df$kmer_names, levels = ordered_by_mu)
# need to mess with the dimensions
par(mar = c(7,5,2,2))
boxplot (mutation_rate_df[,1] ~ mutation_rate_df[,2], data=mutation_rate_df, las=2, cex.axis=0.5, ylab="Copies changed/gen/copy", outline = F)
abline(h=0, col="red", lwd=2)
# AGC actually has the lowest mutation rate (can I give it a Z score?)
mu <- mean(per_kmer_mu_abs_norm_mean)
sigma <- sd(per_kmer_mu_abs_norm_mean)
(AGC_zscore <- abs(7.584906e-05 - mu)/sigma)
## [1] 1.384379
AGAT <- data_sorted_common[,5]
names(AGAT) <- rownames(data_sorted_common)
obj<-contMap(mouse_tree,AGAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
par(mar = c(2, 2, 2, 2))
plot(per_line_mu_abs_norm_mean) # 3 and 10 are the highest
mean(mu_common_abs_norm)
## [1] 0.0007450784
# 7.45 E-4 - lower than Daphnia, higher than Chlamy
# very similar to daphnia, daphnia is 2.74 E-3.
# kmers with the highest standard deviation
sd <- c()
for (i in 1:ncol(legit_data)) {
sd <- c(sd, sd(legit_data[,i]))
}
sd
## [1] 9957.1099033 6312.6563626 8606.9135939 8040.9738460 7767.0997694
## [6] 570.5964272 1789.8022690 853.7009587 1254.8007746 378.3459036
## [11] 230.3492923 183.1972275 188.7837067 332.2424407 335.1086236
## [16] 169.6278825 327.5525751 79.1920233 396.9262071 195.3265407
## [21] 90.6380252 40.1450147 48.8291491 176.3744513 12.2412836
## [26] 51.8297817 426.2687198 111.5508598 208.0048271 120.4103667
## [31] 18.2018852 18.9062639 83.6433266 63.1129763 52.7363957
## [36] 45.6476752 36.3686831 42.5154621 16.9383147 39.5320561
## [41] 17.0625560 16.6665497 7.8421324 9.6791553 25.5961594
## [46] 16.3376868 9.3763077 18.6968278 28.5239660 38.9115751
## [51] 12.0728024 57.0572337 29.5033287 29.6012729 73.7009512
## [56] 16.7066702 13.2541626 9.1632612 15.5622560 28.3120289
## [61] 28.2982676 8.4674822 16.9534867 6.5888211 30.3089501
## [66] 11.0937146 4.2005369 10.9788189 18.0298987 19.3507631
## [71] 34.8572776 56.1160209 7.4906817 40.5618313 19.9825589
## [76] 9.0076942 25.2730707 7.5890319 26.5252713 57.4568452
## [81] 10.7861078 2.7511383 12.0045824 6.1175097 4.8028498
## [86] 10.9729619 14.7432785 13.7463975 7.5361169 6.5703485
## [91] 9.9003758 9.4386799 8.7701417 4.3872420 11.3718690
## [96] 3.7426031 5.3193396 4.2162387 5.0405273 5.5037682
## [101] 22.5642308 19.1164460 7.9308215 36.4249219 9.6125092
## [106] 6.5136700 6.2043048 2.9429164 3.9857219 8.8981661
## [111] 5.4297570 6.9444788 5.8663832 2.6372233 11.6857256
## [116] 1.8536703 3.4002931 8.7891692 5.5076366 6.0821834
## [121] 6.5291210 8.4407447 10.0860729 3.7469204 4.2874332
## [126] 2.5586677 2.5250416 2.9386673 2.3428130 6.2776984
## [131] 5.3380183 4.1794643 22.8362892 7.5265612 1.7055315
## [136] 3.6251102 5.4020038 6.3895524 4.4102022 1.8506297
## [141] 4.0404523 3.1878192 7.7663491 6.6728866 2.2278659
## [146] 5.0653438 3.6532146 5.3769198 2.2550674 15.1614854
## [151] 5.0684435 1.7931575 5.4407406 3.1444834 9.5947008
## [156] 4.9877285 9.2424676 13.6021024 8.7173894 4.0570806
## [161] 1.4314800 3.6177478 3.7849046 4.6055801 8.4295643
## [166] 2.5659941 4.5006411 6.7777854 1.7749897 2.8129887
## [171] 1.5803899 3.4992244 1.8896081 2.8183649 7.5265216
## [176] 2.0386250 1.9564274 5.8145051 3.2509880 2.9654171
## [181] 1.5851584 2.6954999 6.0079811 3.6830944 2.8845621
## [186] 1.1851385 6.0994855 1.0799653 3.5854369 1.4394000
## [191] 5.5539800 3.2340652 10.5070088 3.4647804 1.3510858
## [196] 2.0655693 1.7046625 6.5714931 9.0683079 4.0883660
## [201] 7.3734443 6.1834480 2.6100220 1.7239635 7.0765073
## [206] 2.0439891 6.3740054 7.0660472 4.2871014 3.2467072
## [211] 4.3908385 1.9508601 1.0489485 2.3859475 1.4774957
## [216] 2.9289521 1.7513095 2.9867118 2.6466406 1.8556864
## [221] 3.7742848 4.9056627 3.4135294 4.5098690 1.6249126
## [226] 2.7804980 3.2296228 1.9336527 7.9649253 12.5841699
## [231] 5.0899922 2.2379486 5.3368401 2.3436934 1.7420621
## [236] 2.8157543 1.4387750 1.8571999 3.4961983 3.7389719
## [241] 2.8357502 4.6999486 3.2925898 2.8595538 6.8921808
## [246] 1.9043079 2.5690652 2.4699129 1.9623987 3.3316314
## [251] 3.6181429 2.5846463 3.3029881 1.5440596 3.1623241
## [256] 2.4231367 2.4466166 1.3566438 3.2627182 2.5958148
## [261] 2.1936039 1.4168219 1.7815376 1.5556389 3.8235453
## [266] 2.6905016 2.9683035 3.8768662 2.1269965 1.7652066
## [271] 1.2261324 1.3882335 4.0067122 2.1624217 2.3688705
## [276] 2.1677589 3.6576346 5.7103506 2.5239538 2.1370961
## [281] 3.1429249 1.2378197 1.6942991 3.7154594 1.7665249
## [286] 1.8318136 7.1635150 1.0855347 2.0048945 1.2521898
## [291] 4.4784983 1.8595823 1.8329521 2.5334187 1.9583700
## [296] 3.6637937 2.1584051 1.8295409 4.0441783 1.1260898
## [301] 1.8069938 2.9500987 2.4660596 1.3082240 1.7598669
## [306] 1.3668529 1.5998933 2.6041545 0.7136022 2.8426739
## [311] 1.9263509 1.2703996 3.6812064 1.3008995 1.6664135
## [316] 3.3832089 1.4627867 3.4932053 1.8401759 2.6107575
## [321] 1.7265176 3.1925716 1.2028047 1.7925209 0.9711582
## [326] 2.4052882 1.4040474 1.5208721 0.8181110 2.7046235
## [331] 3.0417052 1.3560903 2.8157875 1.0381225 5.2992271
## [336] 2.4086212 4.4962079 1.3445884 4.0614032 2.4140260
## [341] 0.8366538 2.1216626 2.5637281 2.1851882 2.2663108
## [346] 1.4130576 1.9744802 2.4486451 4.7282184 1.1728564
## [351] 1.0135909 1.7697420 1.3894022 1.4507574 3.1122158
## [356] 1.7618655 1.3123523 1.2128617 1.6421444 1.5434975
## [361] 1.8326516 3.6378101 1.5358566 3.8486163 1.8863765
## [366] 1.3196631 1.9309178 1.7367908 1.1554598 2.4717763
## [371] 1.6266946 1.5642404 1.6221090 1.2485554 1.5601434
## [376] 1.8255976 0.9761845 1.5479371 1.8484187 1.2483611
## [381] 2.6866403 1.0365166 2.8019063 1.9911157 1.5089319
## [386] 2.3925958 1.5062018 1.2215534 1.9065261 1.2711116
## [391] 1.6105958 1.5843233 1.3697030 1.5212503 3.0643399
## [396] 1.3743428 2.4238683 1.7816216 1.2306241 1.6996059
## [401] 1.8394960 1.6682816 1.4243895 2.1893513 1.9615961
## [406] 2.7719094 1.4685968 1.6734613 2.2953463 1.6696921
## [411] 2.8304658 2.6520751 2.3253428 1.2501434 2.6526685
## [416] 2.3812287 1.8513401 2.5513152 2.5143391 2.5880911
## [421] 2.6372221 2.3188088 1.9510371 2.5832412 2.2589268
## [426] 2.3529869 6.0786728 2.6662312 2.4937127 2.3805265
## [431] 3.5344310 2.8935189 3.4683505 2.2724736
calc_mutation_rates_abs_random <- function (df_abun) {
mu_mx <- matrix(nrow=nrow(df_abun), ncol=ncol(df_abun))
for (i in 1:ncol(df_abun)) {
anc_states <- fastAnc(mouse_tree, sample(df_abun[,i]), vars=T, CI=T)
for (j in 1:nrow(df_abun)) {
subset_table <- node_info[which(node_info$Sample==rownames(df_abun)[j]),]
mu_mx[j,i] <- (abs(df_abun[j,i] - anc_states$ace[subset_table$nearest_node])) / subset_table$gens
}
}
rownames(mu_mx) <- rownames(df_abun)
colnames(mu_mx) <- colnames(df_abun)
return(mu_mx)
}
new_rownames <- gsub("/", "", rownames(legit_data))
new_rownames2 <- gsub("L", "", new_rownames)
#new_rownames2
rownames(legit_data) <- new_rownames2
mu_mx_rand_1 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_1 <- calc_norm_mu(legit_data, mu_mx_rand_1)
plot(colMeans(mu_mx_rand_norm_1))
#great, we have a few outliers
sort(colMeans(mu_mx_rand_norm_1), decreasing=T)[1:7]
## AAAGACAG AAAGACAGACAG AGCATGG AAAGAAAGG
## 0.044398979 0.025758444 0.010513202 0.005971288
## AAAGAAAGAAAGAAAGG AAATGAAT AAAGAAGAAG
## 0.005559348 0.004930817 0.004675407
which(colnames(legit_data)=="AAAGACAG")
## [1] 104
which(colnames(legit_data)=="AAAGACAGACAG")
## [1] 27
which(colnames(legit_data)=="AGCATGG")
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG")
## [1] 80
which(colnames(legit_data)=="AAATGAAT")
## [1] 230
which(colnames(legit_data)=="AAGATAGATAGATCT")
## [1] 433
which(colnames(legit_data)=="AAAGAAAGAAAGAAAGG")
## [1] 133
AAAGACAG <- legit_data[,104]
names(AAAGACAG) <- rownames(legit_data)
AAAGACAG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 68.345280 57.793219 86.259274 71.408165
## B6JBomTac B6J B10ScSnJ B10SnJ
## 49.622705 94.690465 0.000000 0.000000
## B10ScCr B6NJ B10J B6ByJ
## 0.000000 81.809260 1.901294 51.787538
## B6JEiJ B10ScNHsd
## 62.088058 0.000000
obj<-contMap(mouse_tree,AAAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGACAGACAG <- legit_data[,27]
names(AAAGACAGACAG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AAAGACAGACAG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGACAGACAG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 881.309078 698.319519 963.295641 838.750237
## B6JBomTac B6J B10ScSnJ B10SnJ
## 739.659689 1001.591049 5.401604 4.415329
## B10ScCr B6NJ B10J B6ByJ
## 5.880901 919.069355 8.217183 729.856715
## B6JEiJ B10ScNHsd
## 843.105703 3.901369
# yes good one
AGCATGG <- legit_data[,427]
names(AGCATGG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AGCATGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AGCATGG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 4.234116 2.470631 2.183164 1.252682
## B6JBomTac B6J B10ScSnJ B10SnJ
## 2.135496 2.137580 15.617626 14.967658
## B10ScCr B6NJ B10J B6ByJ
## 17.962139 3.360821 11.100449 1.949454
## B6JEiJ B10ScNHsd
## 1.188934 9.442596
AAAGAAAGG <- legit_data[,80]
names(AAAGAAAGG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGAAAGG
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 40.503028 95.985217 10.194145 115.770502
## B6JBomTac B6J B10ScSnJ B10SnJ
## 140.923140 50.373755 26.822014 27.493762
## B10ScCr B6NJ B10J B6ByJ
## 4.731663 202.989221 24.439834 58.854640
## B6JEiJ B10ScNHsd
## 95.534331 24.380243
AAATGAAT <- legit_data[,230]
names(AAATGAAT) <- rownames(legit_data)
# not really, just one pair.
obj<-contMap(mouse_tree,AAATGAAT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# no
AAGATAGATAGATCT <- legit_data[,433]
names(AAGATAGATAGATCT) <- rownames(legit_data)
# nothing here
obj<-contMap(mouse_tree,AAGATAGATAGATCT,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
AAAGAAAGAAAGAAAGG <- legit_data[,133]
names(AAAGAAAGAAAGAAAGG) <- rownames(legit_data)
obj<-contMap(mouse_tree,AAAGAAAGAAAGAAAGG,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# do it again and see if you get any additional
mu_mx_rand_2 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_2 <- calc_norm_mu(legit_data, mu_mx_rand_2)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_2))
mu_mx_rand_3 <- calc_mutation_rates_abs_random(legit_data)
mu_mx_rand_norm_3 <- calc_norm_mu(legit_data, mu_mx_rand_3)
par(mar = c(2, 2, 2, 2))
plot(colMeans(mu_mx_rand_norm_3))
# nope, no others are having phylo signal
# have 6 kmers with phylogenetic signal
complex <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/complex_revised_cn.csv", header=T)
samples <- as.vector(complex$Sample)
complex <- as.matrix(complex[,c(2:3)])
rownames(complex) <- samples
complex
## cn_major cn_minor
## B6JEiJ 1048990.8 123362.6
## B10ScNHsd 1060820.1 135657.5
## B6NHsd 1317473.6 129958.4
## B6NTac 1207882.2 136154.8
## B6NJ 1031331.9 129371.0
## B6NTyrCBrdCrlCrl 1239977.1 137196.5
## B6JBomTac 999881.7 130793.7
## B6J 1118980.8 124911.5
## B10ScCr 1068573.0 128685.2
## B10SnJ 923277.8 129303.0
## B6ByJ 1069595.9 125996.3
## B6NCrl 954977.0 123889.9
## B10ScSnJ 971019.3 133243.0
## B10J 1056396.0 143527.2
# visualize:
range(complex[,1])
## [1] 923277.8 1317473.6
# first, major
obj<-contMap(mouse_tree,complex[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# B6NHsd expansion (1317473), B6NJ loss (1031331.9)
complex[,1]
## B6JEiJ B10ScNHsd B6NHsd B6NTac
## 1048990.8 1060820.1 1317473.6 1207882.2
## B6NJ B6NTyrCBrdCrlCrl B6JBomTac B6J
## 1031331.9 1239977.1 999881.7 1118980.8
## B10ScCr B10SnJ B6ByJ B6NCrl
## 1068573.0 923277.8 1069595.9 954977.0
## B10ScSnJ B10J
## 971019.3 1056396.0
anc_states <- fastAnc(mouse_tree, complex[,1], vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
## 15 16 17 18 19 20 21 22 23
## 1045425 1009114 1014464 1027204 1045385 1081737 1098129 1109461 1157943
## 24 25 26 27
## 1161155 1147246 1065387 1062584
##
## Variances on ancestral states:
## 15 16 17 18 19 20
## 16995854715 5103246361 4400057772 4106422853 3438407147 4700132074
## 21 22 23 24 25 26
## 3277108126 2784220764 2344618027 1996686318 1609537905 3474444128
## 27
## 2832675811
##
## Lower & upper 95% CIs:
## lower upper
## 15 789903.6 1300947
## 16 869097.3 1149130
## 17 884451.9 1144477
## 18 901604.9 1152804
## 19 930455.1 1160316
## 20 947363.8 1216109
## 21 985926.4 1210331
## 22 1006040.4 1212882
## 23 1063037.1 1252848
## 24 1073573.6 1248736
## 25 1068612.2 1225879
## 26 949856.0 1180918
## 27 958267.5 1166901
#major satellite at ancestor of B6NHsd and B6NJ: 1157943
# gained by B6NHsd:
1317473-1157943 #159,530 copies
## [1] 159530
# lost by B6NJ
1157943-1031331.9 # 126,611 copies lost
## [1] 126611.1
# second, minor
obj<-contMap(mouse_tree,complex[,2],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
complex[,2] # b6NHsd: 129958.4 , B6NJ: 129371
## B6JEiJ B10ScNHsd B6NHsd B6NTac
## 123362.6 135657.5 129958.4 136154.8
## B6NJ B6NTyrCBrdCrlCrl B6JBomTac B6J
## 129371.0 137196.5 130793.7 124911.5
## B10ScCr B10SnJ B6ByJ B6NCrl
## 128685.2 129303.0 125996.3 123889.9
## B10ScSnJ B10J
## 133243.0 143527.2
anc_states <- fastAnc(mouse_tree, complex[,2], vars=T, CI=T)
# 130624.7
# how many copies and kb did these lines lose and gain?
130624.7 -129958.4 # B6NHsd : lost 666 copies
## [1] 666.3
130624.7 - 129371 # B6NJ : lost 1253 copies
## [1] 1253.7
mu_complex <- calc_mutation_rates(complex)
#mu_complex
mu_complex_norm <- calc_norm_mu(complex, mu_complex)
mu_complex_abs <- calc_mutation_rates_abs(complex)
mean(mu_complex_abs[,2])
## [1] 26.05974
mu_complex_abs_norm <- calc_norm_mu(complex, mu_complex_abs)
#mu_complex_abs_norm
colMeans(mu_complex_abs)
## cn_major cn_minor
## 559.13985 26.05974
colMeans(mu_complex_abs_norm)
## cn_major cn_minor
## 0.0005004309 0.0001976812
#major: 4.8 E-4, minor: 1.9 E-4
# major satellite has a higher mutation rate than minor, even after normalizing
# might be because of increased recombination -- more distal to the centromere
library(corrplot)
#http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software
combined_mu_norm <- cbind(mu_complex_norm, mu_common_norm)
combined_mu_norm[,1:10]
## cn_major cn_minor AC AG
## B6JEiJ -9.160690e-05 -1.615809e-04 -1.665755e-04 4.075430e-04
## B10ScNHsd 1.069893e-04 -7.387808e-05 -1.037055e-04 5.307961e-04
## B6NHsd 9.983401e-04 -3.696561e-05 -1.520792e-04 -1.564207e-03
## B6NTac 3.907011e-04 2.792860e-04 4.070876e-05 1.091716e-03
## B6NJ -1.578695e-03 -4.101223e-04 -1.399663e-04 4.652882e-04
## B6NTyrCBrdCrlCrl 1.262965e-03 5.102066e-04 4.700042e-04 -4.409409e-04
## B6JBomTac -5.087033e-04 2.367385e-04 1.526023e-04 1.838519e-04
## B6J 4.575415e-04 -1.616087e-04 9.140961e-05 4.974917e-05
## B10ScCr 1.728454e-04 -1.568226e-04 2.490011e-04 -7.222850e-04
## B10SnJ -2.893223e-04 -8.429768e-05 4.230222e-04 -5.365049e-04
## B6ByJ -1.255219e-04 -8.906739e-05 -3.731076e-04 3.595034e-04
## B6NCrl -7.866809e-04 -2.184930e-04 -1.704128e-04 9.820323e-05
## B10ScSnJ -1.597972e-04 6.259891e-06 -2.642960e-04 -6.374373e-05
## B10J 7.632285e-05 3.422101e-04 -9.510261e-05 6.792768e-05
## AAG AACCCT AGAT AGAGGC
## B6JEiJ -0.0016936235 -0.0004374261 -0.0036641401 -3.605975e-04
## B10ScNHsd 0.0010510781 -0.0008913244 0.0015920712 -3.000287e-04
## B6NHsd -0.0033459144 -0.0009501471 -0.0043442036 2.057965e-06
## B6NTac 0.0012808578 -0.0011192028 0.0009625905 2.326053e-04
## B6NJ 0.0004670982 -0.0021052905 0.0017003030 -2.519822e-05
## B6NTyrCBrdCrlCrl -0.0009093677 0.0033453313 -0.0017989410 2.586653e-04
## B6JBomTac 0.0005484810 0.0011256591 0.0001359405 2.273234e-04
## B6J 0.0005808924 0.0004389475 0.0012644830 9.134611e-05
## B10ScCr -0.0020978667 0.0001943518 -0.0025471971 2.915746e-04
## B10SnJ 0.0016646539 0.0027878050 0.0037813664 5.069733e-04
## B6ByJ 0.0007397267 -0.0014161398 0.0005660927 -4.350530e-04
## B6NCrl 0.0005409763 0.0001317758 0.0005682900 -1.447509e-04
## B10ScSnJ 0.0002449262 -0.0012163915 -0.0002705450 -2.517623e-04
## B10J 0.0002107909 -0.0005693525 0.0004712985 -1.601563e-04
## AAAG AGG
## B6JEiJ -1.514244e-03 -3.295951e-04
## B10ScNHsd 1.043085e-03 -1.410237e-04
## B6NHsd -2.464712e-03 -6.330540e-04
## B6NTac 1.582728e-03 -4.349942e-05
## B6NJ 8.803666e-04 -3.848845e-04
## B6NTyrCBrdCrlCrl -1.402117e-03 7.395840e-04
## B6JBomTac -1.279014e-06 2.846829e-04
## B6J 4.982097e-04 1.979003e-04
## B10ScCr -1.573901e-03 3.587638e-04
## B10SnJ 5.719602e-04 1.111715e-03
## B6ByJ 4.967773e-04 -9.523429e-05
## B6NCrl 6.199326e-04 -1.910845e-04
## B10ScSnJ 3.594212e-04 -2.978637e-04
## B10J 4.671663e-04 -7.151320e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)
# relate copy number changes to sequence variability in the complex satellites
variability <- read.csv(file="~/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/variation_centromere_sats.csv", header = T)
variability
## File Line error_rate_major error_rate_minor mu_major
## 1 R104 BL6NTac 0.0356000 0.0448000 453.6645
## 2 R114 BL6NCrl 0.0355000 0.0446000 -872.7919
## 3 R124 BL6NHsd 0.0352000 0.0447000 1156.0206
## 4 R134 BL6NTyrCBrdCrlCrl 0.0354000 0.0448231 1448.9304
## 5 R144 BL6JBomTac 0.0358362 0.0447000 -540.5402
## 6 R2 BL6J 0.0350000 0.0444000 486.1765
## 7 R22 BL10ScSnJ 0.0365000 0.0441000 -162.1085
## 8 R32 BL10SnJ 0.0353000 0.0423000 -291.9592
## 9 R42 BL10ScCr 0.0349000 0.0421000 177.5475
## 10 R52 BL6NJ 0.0352000 0.0443000 -1811.1508
## 11 R62 BL10J 0.0358000 0.0431000 79.7868
## 12 R72 BL6ByJ 0.0355000 0.0447000 -137.8392
## 13 R84 BL6JEiJ 0.0357000 0.0448000 -97.5968
## 14 R94 BL10ScNHsd 0.0357000 0.0431000 111.8450
## mu_minor
## 1 36.9628491
## 2 -28.1580321
## 3 -4.8286223
## 4 67.7851609
## 5 30.1363125
## 6 -20.5724470
## 7 0.8326899
## 8 -11.1769438
## 9 -20.9461098
## 10 -54.4881282
## 11 46.9015225
## 12 -11.4329535
## 13 -20.4892330
## 14 -10.1253442
ggplot(data = variability, aes(x = mu_major, y = error_rate_major)) +
geom_point() +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
ggplot(data = variability, aes(x = mu_minor, y = error_rate_minor)) +
geom_point() +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
data_subset <- data_sorted_common[,5]
names(data_subset) <- rownames(data_sorted_common)
test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
data_subset <- data_sorted_common[,27]
names(data_subset) <- rownames(data_sorted_common)
test_phlosig <- phylosig(mouse_tree, data_subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
test_phlosig$P
## [1] 0.001
phylosig_test <- c()
for (i in 1:ncol(legit_data)) {
subset <- legit_data[,i]
names(subset) <- rownames(legit_data)
test_phlosig <- phylosig(mouse_tree, subset, method="K", test=TRUE, nsim=1000, se=NULL, start=NULL)
phylosig_test <- c(phylosig_test, test_phlosig$P)
}
possibilities <- which(phylosig_test < 0.05)
filtered_possibilities <- c()
for (i in possibilities) {
prop <- min(legit_data[,i])/max(legit_data[,i])
diff <- max(legit_data[,i]) - min(legit_data[,i])
if (prop < 0.3 & diff > 15){
filtered_possibilities <- c(filtered_possibilities, i)
}
}
filtered_possibilities # this is pretty good. But for some reason 80 isn't showing up
## [1] 27 71 72 104 349 427
# the ones in the table:
which(colnames(legit_data)=="AAAGACAGACAG") #27
## [1] 27
which(colnames(legit_data)=="AAAGACAG") #104
## [1] 104
which(colnames(legit_data)=="AGCATGG") # 427
## [1] 427
which(colnames(legit_data)=="AAAGAAAGG") #80
## [1] 80
which(colnames(legit_data)=="AGGGC") #71
## [1] 71
which(colnames(legit_data)=="AAGAAGAAGAGG") #72
## [1] 72
min(legit_data[,27])
## [1] 3.901369
# check all these
phylosig_test[1:10]
## [1] 0.124 0.628 0.343 0.443 0.627 0.105 0.515 0.183 0.589 0.492
phylosig_test[80] # why not significant? this one looks legit.
## [1] 0.439
colnames(legit_data)[15]
## [1] "ACAG"
check <- legit_data[,15]
names(check) <- rownames(legit_data)
check
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 1623.5399 1521.7861 1250.1318 2013.6151
## B6JBomTac B6J B10ScSnJ B10SnJ
## 1680.3956 1627.5711 1148.3129 1168.5650
## B10ScCr B6NJ B10J B6ByJ
## 685.9261 1637.6058 1138.5821 1530.7700
## B6JEiJ B10ScNHsd
## 1546.1709 1123.8266
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
colnames(legit_data)[71]
## [1] "AGGGC"
check_2 <- legit_data[,71]
names(check_2) <- rownames(legit_data)
check_2
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 56.22773 77.83892 82.72548 86.91108
## B6JBomTac B6J B10ScSnJ B10SnJ
## 51.54837 53.20102 84.08393 189.24518
## B10ScCr B6NJ B10J B6ByJ
## 107.44587 88.20542 99.42593 64.28542
## B6JEiJ B10ScNHsd
## 57.50664 86.89554
obj<-contMap(mouse_tree,check_2,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
colnames(legit_data)[72]
## [1] "AAGAAGAAGAGG"
check_3 <- legit_data[,72]
names(check_3) <- rownames(legit_data)
obj<-contMap(mouse_tree,check_3,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
colnames(legit_data)[80]
## [1] "AAAGAAAGG"
check <- legit_data[,80]
names(check) <- rownames(legit_data)
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
#check
# want to apply some sort of filtering - requiring a difference of x%?
colnames(legit_data)[349]
## [1] "AAAGAAAGGAAGGAAGG"
check <- legit_data[,349]
names(check) <- rownames(legit_data)
obj<-contMap(mouse_tree,check,plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
# this one is good
check
## B6NTac B6NCrl B6NHsd B6NTyrCBrdCrlCrl
## 7.6277800 9.2428867 2.3568429 8.5065947
## B6JBomTac B6J B10ScSnJ B10SnJ
## 13.2646907 17.5286549 9.6558086 11.3011399
## B10ScCr B6NJ B10J B6ByJ
## 2.5862785 9.5971337 8.6801232 0.1576554
## B6JEiJ B10ScNHsd
## 14.2165408 9.8961555
Ymin <- read.csv("/Users/jullienflynn/Documents/AndyLab/mouseprojectsummaryandinfo/my_analysis/Ymin_cn.csv", header=T)
samples <- as.vector(Ymin$Sample)
Ymin <- as.matrix(Ymin$cn_HOR_Ymin)
rownames(Ymin) <- samples
Ymin
## [,1]
## B6JEiJ 28.63662
## B10ScNHsd 28.77115
## B6NHsd 21.89659
## B6NTac 28.92109
## B6NJ 20.58688
## B6NTyrCBrdCrlCrl 30.30322
## B6JBomTac 29.80160
## B6J 26.39369
## B10ScCr 20.03651
## B10SnJ 26.36346
## B6ByJ 27.44637
## B6NCrl 28.15001
## B10ScSnJ 25.73320
## B10J 28.53049
colnames(Ymin) <- "cn_HOR_Ymin"
# visualize:
range(Ymin[,1])
## [1] 20.03651 30.30322
# first, major
obj<-contMap(mouse_tree,Ymin[,1],plot=FALSE)
plot(obj,legend=0.7*max(nodeHeights(mouse_tree)),
fsize=c(0.7,0.9))
anc_states <- fastAnc(mouse_tree, Ymin[,1], vars=T, CI=T)
anc_states
## Ancestral character estimates using fastAnc:
## 15 16 17 18 19 20 21 22
## 26.44703 25.73685 25.63808 25.49412 27.32168 27.15721 26.81652 26.57562
## 23 24 25 26 27
## 25.92030 26.34371 25.85353 27.87069 27.97749
##
## Variances on ancestral states:
## 15 16 17 18 19 20 21
## 23.326382 7.004077 6.038968 5.635962 4.719127 6.450813 4.497748
## 22 23 24 25 26 27
## 3.821273 3.217929 2.740402 2.209050 4.768587 3.887776
##
## Lower & upper 95% CIs:
## lower upper
## 15 16.98074 35.91332
## 16 20.54967 30.92403
## 17 20.82152 30.45465
## 18 20.84104 30.14719
## 19 23.06386 31.57950
## 20 22.17911 32.13530
## 21 22.65978 30.97327
## 22 22.74420 30.40705
## 23 22.40433 29.43626
## 24 23.09909 29.58832
## 25 22.94041 28.76665
## 26 23.59062 32.15076
## 27 24.11287 31.84211
mu_Ymin <- calc_mutation_rates(Ymin)
#mu_complex
mu_Ymin
## cn_HOR_Ymin
## B6JEiJ 0.0045591078
## B10ScNHsd 0.0105033902
## B6NHsd -0.0291572766
## B6NTac 0.0250231070
## B6NJ -0.0822913756
## B6NTyrCBrdCrlCrl 0.0695263271
## B6JBomTac 0.0157250869
## B6J -0.0136534962
## B10ScCr -0.0234232196
## B10SnJ 0.0021313171
## B6ByJ 0.0030427341
## B6NCrl 0.0088948335
## B10ScSnJ 0.0003549335
## B10J 0.0087594744
mean(mu_Ymin)
## [1] -3.611772e-07
mu_Ymin_norm <- calc_norm_mu(Ymin, mu_Ymin)
mu_Ymin_norm
## cn_HOR_Ymin
## B6JEiJ 1.635807e-04
## B10ScNHsd 3.844343e-04
## B6NHsd -1.124882e-03
## B6NTac 9.498703e-04
## B6NJ -3.182984e-03
## B6NTyrCBrdCrlCrl 2.689239e-03
## B6JBomTac 5.620620e-04
## B6J -4.880171e-04
## B10ScCr -9.187696e-04
## B10SnJ 8.281188e-05
## B6ByJ 1.134649e-04
## B6NCrl 3.346990e-04
## B10ScSnJ 1.384399e-05
## B10J 3.206053e-04
# absolute mutation rate
mu_Ymin_abs <- calc_mutation_rates_abs(Ymin)
mean(mu_Ymin_abs)
## [1] 0.02121755
range(mu_Ymin_abs)
## [1] 0.0003549335 0.0822913756
mu_Ymin_abs_norm <- calc_norm_mu(Ymin, mu_Ymin_abs)
mean(mu_Ymin_abs_norm)
## [1] 0.0008092332
mu_complex_norm_all <- cbind(mu_complex_norm, mu_Ymin_norm)
hypermutator_df$type <- "simple"
head(hypermutator_df)
## line_names kmer_names abun_hypermut type
## 1 B6NTac AC -1.665755e-04 simple
## 2 B6NCrl AC -1.037055e-04 simple
## 3 B6NHsd AC -1.520792e-04 simple
## 4 B6NTyrCBrdCrlCrl AC 4.070876e-05 simple
## 5 B6JBomTac AC -1.399663e-04 simple
## 6 B6J AC 4.700042e-04 simple
abun_complex <- c()
for (j in 1:ncol(mu_complex_norm_all)) {
for (i in 1:nrow(mu_complex_norm_all)) {
abun_complex <- c(abun_complex, mu_complex_norm_all[i,j])
}
}
line_names <- c(rep(rownames(mu_complex_norm_all), times = ncol(mu_complex_norm_all) ))
kmer_names <- c()
for (i in 1:ncol(mu_complex_norm_all)) {
kmer_names <- c(kmer_names, c(rep(colnames(mu_complex_norm_all)[i], times = nrow(mu_complex_norm_all))))
}
#now make a dataframe
complex_df <- data.frame(line_names, kmer_names, abun_complex)
complex_df$type <- "complex"
head(complex_df)
## line_names kmer_names abun_complex type
## 1 B6JEiJ cn_major -0.0000916069 complex
## 2 B10ScNHsd cn_major 0.0001069893 complex
## 3 B6NHsd cn_major 0.0009983401 complex
## 4 B6NTac cn_major 0.0003907011 complex
## 5 B6NJ cn_major -0.0015786951 complex
## 6 B6NTyrCBrdCrlCrl cn_major 0.0012629646 complex
colnames(complex_df) <- c("line_names", "kmer_names", "abun", "type")
colnames(hypermutator_df) <- c("line_names", "kmer_names", "abun", "type")
perline_boxplot_df <- rbind(hypermutator_df, complex_df)
#perline_boxplot_df$line_names <- factor(perline_boxplot_df$line_names, levels = c(""))
## make the plot including mu_complex
ggplot(data = perline_boxplot_df, aes(x = line_names, y = abun)) +
geom_boxplot(outlier.shape = NA, width=0.7) +
#geom_point(position = position_jitter(w = 0.5, h = 0, alpha = 0.6, color=type)) +
geom_jitter(aes(color=type, alpha=0.6)) +
theme_bw() +
theme(text = element_text(size=14), axis.text.x = element_text(angle = 30, hjust = 1))
* Investigate the correlation between kmers *
library(corrplot)
mu_complex_norm2 <- cbind(mu_complex_norm, mu_Ymin_norm)
combined_mu_norm <- cbind(mu_complex_norm2, mu_common_norm)
combined_mu_norm[,1:10]
## cn_major cn_minor cn_HOR_Ymin AC
## B6JEiJ -9.160690e-05 -1.615809e-04 1.635807e-04 -1.665755e-04
## B10ScNHsd 1.069893e-04 -7.387808e-05 3.844343e-04 -1.037055e-04
## B6NHsd 9.983401e-04 -3.696561e-05 -1.124882e-03 -1.520792e-04
## B6NTac 3.907011e-04 2.792860e-04 9.498703e-04 4.070876e-05
## B6NJ -1.578695e-03 -4.101223e-04 -3.182984e-03 -1.399663e-04
## B6NTyrCBrdCrlCrl 1.262965e-03 5.102066e-04 2.689239e-03 4.700042e-04
## B6JBomTac -5.087033e-04 2.367385e-04 5.620620e-04 1.526023e-04
## B6J 4.575415e-04 -1.616087e-04 -4.880171e-04 9.140961e-05
## B10ScCr 1.728454e-04 -1.568226e-04 -9.187696e-04 2.490011e-04
## B10SnJ -2.893223e-04 -8.429768e-05 8.281188e-05 4.230222e-04
## B6ByJ -1.255219e-04 -8.906739e-05 1.134649e-04 -3.731076e-04
## B6NCrl -7.866809e-04 -2.184930e-04 3.346990e-04 -1.704128e-04
## B10ScSnJ -1.597972e-04 6.259891e-06 1.384399e-05 -2.642960e-04
## B10J 7.632285e-05 3.422101e-04 3.206053e-04 -9.510261e-05
## AG AAG AACCCT AGAT
## B6JEiJ 4.075430e-04 -0.0016936235 -0.0004374261 -0.0036641401
## B10ScNHsd 5.307961e-04 0.0010510781 -0.0008913244 0.0015920712
## B6NHsd -1.564207e-03 -0.0033459144 -0.0009501471 -0.0043442036
## B6NTac 1.091716e-03 0.0012808578 -0.0011192028 0.0009625905
## B6NJ 4.652882e-04 0.0004670982 -0.0021052905 0.0017003030
## B6NTyrCBrdCrlCrl -4.409409e-04 -0.0009093677 0.0033453313 -0.0017989410
## B6JBomTac 1.838519e-04 0.0005484810 0.0011256591 0.0001359405
## B6J 4.974917e-05 0.0005808924 0.0004389475 0.0012644830
## B10ScCr -7.222850e-04 -0.0020978667 0.0001943518 -0.0025471971
## B10SnJ -5.365049e-04 0.0016646539 0.0027878050 0.0037813664
## B6ByJ 3.595034e-04 0.0007397267 -0.0014161398 0.0005660927
## B6NCrl 9.820323e-05 0.0005409763 0.0001317758 0.0005682900
## B10ScSnJ -6.374373e-05 0.0002449262 -0.0012163915 -0.0002705450
## B10J 6.792768e-05 0.0002107909 -0.0005693525 0.0004712985
## AGAGGC AAAG
## B6JEiJ -3.605975e-04 -1.514244e-03
## B10ScNHsd -3.000287e-04 1.043085e-03
## B6NHsd 2.057965e-06 -2.464712e-03
## B6NTac 2.326053e-04 1.582728e-03
## B6NJ -2.519822e-05 8.803666e-04
## B6NTyrCBrdCrlCrl 2.586653e-04 -1.402117e-03
## B6JBomTac 2.273234e-04 -1.279014e-06
## B6J 9.134611e-05 4.982097e-04
## B10ScCr 2.915746e-04 -1.573901e-03
## B10SnJ 5.069733e-04 5.719602e-04
## B6ByJ -4.350530e-04 4.967773e-04
## B6NCrl -1.447509e-04 6.199326e-04
## B10ScSnJ -2.517623e-04 3.594212e-04
## B10J -1.601563e-04 4.671663e-04
cor_mu <- cor(combined_mu_norm)
par(mar=c(3,3,10,2))
corrplot(cor_mu, type="upper", order="original", tl.col="black", tl.cex=0.5, tl.srt=90)
#install.packages("Hmisc")
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:Biostrings':
##
## mask, translate
## The following object is masked from 'package:ape':
##
## zoom
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
res2<-rcorr(combined_mu_norm)
res2.2 <- flattenCorrMatrix(res2$r, res2$P)
head(res2.2)
## row column cor p
## 1 cn_major cn_minor 0.5961939 0.0244345812
## 2 cn_major cn_HOR_Ymin 0.5728026 0.0322751053
## 3 cn_minor cn_HOR_Ymin 0.7871867 0.0008331837
## 4 cn_major AC 0.3384863 0.2365039854
## 5 cn_minor AC 0.4020968 0.1540876291
## 6 cn_HOR_Ymin AC 0.3735314 0.1883233036
sig_cor <- subset(res2.2, p<0.05)
nrow(res2.2)
## [1] 2145
nrow(sig_cor)
## [1] 609
nrow(sig_cor[which(sig_cor$cor > 0), ])
## [1] 475
nrow(sig_cor[which(sig_cor$cor < 0), ])
## [1] 134
strong_cor <- subset(res2.2, cor>0.95)
strong_neg_cor <- subset(res2.2, cor<(-0.9))
nrow(sig_cor)
## [1] 609
sig_cor
## row column cor p
## 1 cn_major cn_minor 0.5961939 2.443458e-02
## 2 cn_major cn_HOR_Ymin 0.5728026 3.227511e-02
## 3 cn_minor cn_HOR_Ymin 0.7871867 8.331837e-04
## 15 AG AAG 0.6775671 7.755117e-03
## 18 cn_HOR_Ymin AACCCT 0.5924363 2.558601e-02
## 19 AC AACCCT 0.8693569 5.385384e-05
## 27 AAG AGAT 0.9429372 4.404587e-07
## 32 AC AGAGGC 0.8894076 2.073145e-05
## 35 AACCCT AGAGGC 0.6663045 9.269577e-03
## 41 AG AAAG 0.7544191 1.821581e-03
## 42 AAG AAAG 0.9429756 4.387182e-07
## 44 AGAT AAAG 0.8874420 2.294166e-05
## 49 AC AGG 0.8429168 1.532526e-04
## 52 AACCCT AGG 0.8373350 1.865758e-04
## 54 AGAGGC AGG 0.7269247 3.225056e-03
## 69 cn_HOR_Ymin AAGGAG 0.5433964 4.460971e-02
## 71 AG AAGGAG 0.5969839 2.419754e-02
## 72 AAG AAGGAG 0.7995362 5.990438e-04
## 74 AGAT AAGGAG 0.6770462 7.820620e-03
## 76 AAAG AAGGAG 0.6188191 1.830330e-02
## 77 AGG AAGGAG 0.6431434 1.309396e-02
## 83 AG AAGG 0.8536113 1.028616e-04
## 84 AAG AAGG 0.8355912 1.981108e-04
## 86 AGAT AAGG 0.7092056 4.506670e-03
## 88 AAAG AAGG 0.7855805 8.683671e-04
## 91 AAGGAG AAGG 0.8616187 7.474152e-05
## 95 AC ACC 0.7646111 1.446958e-03
## 98 AACCCT ACC 0.8665906 6.068786e-05
## 100 AGAGGC ACC 0.6491285 1.200722e-02
## 102 AGG ACC 0.9168749 3.974093e-06
## 109 AC ATCC 0.7771135 1.074017e-03
## 112 AACCCT ATCC 0.7878102 8.198365e-04
## 113 AGAT ATCC 0.5527153 4.038039e-02
## 114 AGAGGC ATCC 0.6698154 8.774933e-03
## 116 AGG ATCC 0.9325039 1.178976e-06
## 118 AAGGAG ATCC 0.6961116 5.686714e-03
## 120 ACC ATCC 0.8815212 3.079216e-05
## 125 AG AAAGG 0.8640291 6.763051e-05
## 126 AAG AAAGG 0.8948035 1.554274e-05
## 128 AGAT AAAGG 0.7591261 1.640045e-03
## 130 AAAG AAAGG 0.8452743 1.407147e-04
## 133 AAGGAG AAAGG 0.8298590 2.401628e-04
## 134 AAGG AAAGG 0.9659691 2.083762e-08
## 141 AG ACAG 0.8440128 1.473157e-04
## 142 AAG ACAG 0.5771353 3.069849e-02
## 146 AAAG ACAG 0.6633226 9.706363e-03
## 149 AAGGAG ACAG 0.5460859 4.335732e-02
## 150 AAGG ACAG 0.6929443 6.005353e-03
## 153 AAAGG ACAG 0.6919266 6.110638e-03
## 154 cn_major AAGTCTGCACACACATACT 0.5574111 3.836459e-02
## 157 AC AAGTCTGCACACACATACT 0.8138235 3.973446e-04
## 158 AG AAGTCTGCACACACATACT -0.6530048 1.134122e-02
## 160 AACCCT AAGTCTGCACACACATACT 0.9066425 7.796126e-06
## 162 AGAGGC AAGTCTGCACACACATACT 0.6767620 7.856546e-03
## 164 AGG AAGTCTGCACACACATACT 0.7636160 1.480572e-03
## 168 ACC AAGTCTGCACACACATACT 0.8603169 7.882775e-05
## 169 ATCC AAGTCTGCACACACATACT 0.6407927 1.354079e-02
## 176 AG ACCT 0.8952618 1.515641e-05
## 177 AAG ACCT 0.8958921 1.463790e-05
## 179 AGAT ACCT 0.7367723 2.648840e-03
## 181 AAAG ACCT 0.8836579 2.773969e-05
## 184 AAGGAG ACCT 0.8226237 3.032524e-04
## 185 AAGG ACCT 0.9209232 2.971879e-06
## 188 AAAGG ACCT 0.9484069 2.435194e-07
## 189 ACAG ACCT 0.8299220 2.396642e-04
## 193 cn_HOR_Ymin ACACAG 0.5625084 3.626140e-02
## 194 AC ACACAG 0.8124835 4.135323e-04
## 197 AACCCT ACACAG 0.7079737 4.608720e-03
## 199 AGAGGC ACACAG 0.7499037 2.010374e-03
## 201 AGG ACACAG 0.7924395 7.260019e-04
## 205 ACC ACACAG 0.6464933 1.247683e-02
## 206 ATCC ACACAG 0.5919067 2.575151e-02
## 209 AAGTCTGCACACACATACT ACACAG 0.6963836 5.659978e-03
## 216 AAG AT 0.8678601 5.746876e-05
## 218 AGAT AT 0.9335224 1.078596e-06
## 220 AAAG AT 0.7113482 4.333413e-03
## 223 AAGGAG AT 0.7371431 2.628854e-03
## 224 AAGG AT 0.6745085 8.145893e-03
## 226 ATCC AT 0.7349683 2.747793e-03
## 227 AAAGG AT 0.7231566 3.469960e-03
## 230 ACCT AT 0.6255011 1.673849e-02
## 232 cn_major AAGAG -0.5415476 4.548580e-02
## 236 AG AAGAG 0.8241880 2.885918e-04
## 237 AAG AAGAG 0.9416123 5.040257e-07
## 239 AGAT AAGAG 0.8551208 9.699140e-05
## 241 AAAG AAGAG 0.8966696 1.401845e-05
## 244 AAGGAG AAGAG 0.7995170 5.993607e-04
## 245 AAGG AAGAG 0.9294186 1.531169e-06
## 248 AAAGG AAGAG 0.9757621 2.778423e-09
## 249 ACAG AAGAG 0.6358095 1.452646e-02
## 251 ACCT AAGAG 0.9370874 7.809699e-07
## 253 AT AAGAG 0.8055010 5.067089e-04
## 258 AG AAGGG 0.5466281 4.310799e-02
## 266 AAGGAG AAGGG 0.5684020 3.393717e-02
## 267 AAGG AAGGG 0.6538637 1.119757e-02
## 271 ACAG AAGGG 0.6072499 2.127173e-02
## 274 ACACAG AAGGG 0.6271593 1.636644e-02
## 285 AGAGGC AGAGAGGC 0.6353183 1.462651e-02
## 297 ACACAG AGAGAGGC 0.7173135 3.878414e-03
## 300 AAGGG AGAGAGGC 0.7838064 9.086015e-04
## 306 AAG ACATAT 0.8447661 1.433445e-04
## 308 AGAT ACATAT 0.8424598 1.557848e-04
## 310 AAAG ACATAT 0.7633388 1.490043e-03
## 313 AAGGAG ACATAT 0.7917486 7.394321e-04
## 314 AAGG ACATAT 0.6992994 5.379474e-03
## 316 ATCC ACATAT 0.5815215 2.916145e-02
## 317 AAAGG ACATAT 0.6805089 7.393133e-03
## 320 ACCT ACATAT 0.7220530 3.544392e-03
## 322 AT ACATAT 0.7745860 1.142399e-03
## 323 AAGAG ACATAT 0.7141328 4.116093e-03
## 337 ACAGC ACAGACAGCACAGCACAGC 0.9932645 1.328715e-12
## 353 cn_minor AGC 0.6080687 2.105044e-02
## 354 cn_HOR_Ymin AGC 0.5487262 4.215305e-02
## 382 AC AGAGG 0.5781589 3.033451e-02
## 384 AAG AGAGG 0.7480044 2.094276e-03
## 386 AGAT AGAGG 0.7893651 7.872956e-04
## 387 AGAGGC AGAGG 0.5708973 3.298711e-02
## 388 AAAG AGAGG 0.6466368 1.245089e-02
## 389 AGG AGAGG 0.5952743 2.471271e-02
## 391 AAGGAG AGAGG 0.7680748 1.334663e-03
## 392 AAGG AGAGG 0.7991167 6.060123e-04
## 394 ATCC AGAGG 0.7420271 2.376562e-03
## 395 AAAGG AGAGG 0.7047049 4.888267e-03
## 398 ACCT AGAGG 0.6545014 1.109183e-02
## 400 AT AGAGG 0.7760032 1.103643e-03
## 401 AAGAG AGAGG 0.7281660 3.147433e-03
## 402 AAGGG AGAGG 0.6328730 1.513241e-02
## 404 ACATAT AGAGG 0.8173272 3.574155e-04
## 436 cn_major AAAAACAAGGGAGATAT 0.5407722 4.585699e-02
## 440 AG AAAAACAAGGGAGATAT -0.9375549 7.475579e-07
## 441 AAG AAAAACAAGGGAGATAT -0.7855231 8.696458e-04
## 443 AGAT AAAAACAAGGGAGATAT -0.6308160 1.556820e-02
## 445 AAAG AAAAACAAGGGAGATAT -0.8331100 2.155119e-04
## 448 AAGGAG AAAAACAAGGGAGATAT -0.6015439 2.286288e-02
## 449 AAGG AAAAACAAGGGAGATAT -0.8679448 5.725925e-05
## 452 AAAGG AAAAACAAGGGAGATAT -0.9343189 1.005095e-06
## 453 ACAG AAAAACAAGGGAGATAT -0.7646264 1.446444e-03
## 454 AAGTCTGCACACACATACT AAAAACAAGGGAGATAT 0.6316706 1.538601e-02
## 455 ACCT AAAAACAAGGGAGATAT -0.8996703 1.182541e-05
## 458 AAGAG AAAAACAAGGGAGATAT -0.9112247 5.823148e-06
## 466 cn_major AGGC 0.5469852 4.294434e-02
## 467 cn_minor AGGC 0.5649979 3.526571e-02
## 468 cn_HOR_Ymin AGGC 0.6404235 1.361201e-02
## 469 AC AGGC 0.5374873 4.745394e-02
## 470 AG AGGC -0.5500716 4.154896e-02
## 472 AACCCT AGGC 0.7835755 9.139447e-04
## 480 ACC AGGC 0.6854910 6.810130e-03
## 484 AAGTCTGCACACACATACT AGGC 0.8259637 2.726498e-04
## 508 ACAGC ACAGACAGCACAGC 0.9944554 4.145573e-13
## 523 ACAGACAGCACAGCACAGC ACAGACAGCACAGC 0.9981677 4.440892e-16
## 553 AGAGAGGC AACC 0.6477331 1.225416e-02
## 565 AC AAAAG -0.6445830 1.282595e-02
## 568 AACCCT AAAAG -0.8319098 2.243619e-04
## 572 AGG AAAAG -0.7348410 2.754883e-03
## 576 ACC AAAAG -0.8445019 1.447274e-04
## 577 ATCC AAAAG -0.8182549 3.474049e-04
## 580 AAGTCTGCACACACATACT AAAAG -0.7349647 2.747993e-03
## 583 AT AAAAG -0.5905381 2.618288e-02
## 593 AGGC AAAAG -0.6866443 6.680409e-03
## 600 AG AAAGAG 0.6005715 2.314273e-02
## 601 AAG AAAGAG 0.9750127 3.329960e-09
## 603 AGAT AAAGAG 0.9664049 1.930525e-08
## 605 AAAG AAAGAG 0.8978024 1.315450e-05
## 608 AAGGAG AAAGAG 0.7679822 1.337571e-03
## 609 AAGG AAAGAG 0.7839200 9.059815e-04
## 611 ATCC AAAGAG 0.5368995 4.774390e-02
## 612 AAAGG AAAGAG 0.8509897 1.137408e-04
## 615 ACCT AAAGAG 0.8410475 1.638249e-04
## 617 AT AAAGAG 0.9183877 3.571354e-06
## 618 AAGAG AAAGAG 0.9283130 1.676735e-06
## 621 ACATAT AAAGAG 0.7952075 6.741465e-04
## 624 AGAGG AAAGAG 0.7455215 2.208108e-03
## 626 AAAAACAAGGGAGATAT AAAGAG -0.7285579 3.123237e-03
## 636 AAG AGATAT 0.7239813 3.415155e-03
## 638 AGAT AGATAT 0.7883346 8.087444e-04
## 640 AAAG AGATAT 0.5990864 2.357512e-02
## 643 AAGGAG AGATAT 0.6167043 1.882094e-02
## 645 ACC AGATAT 0.5530898 4.021686e-02
## 646 ATCC AGATAT 0.5878097 2.705898e-02
## 652 AT AGATAT 0.8009872 5.754294e-04
## 653 AAGAG AGATAT 0.5679404 3.411511e-02
## 656 ACATAT AGATAT 0.7413573 2.409980e-03
## 666 AAAGAG AGATAT 0.7675353 1.351680e-03
## 670 AC AGCAGG 0.9276230 1.773226e-06
## 671 AG AGCAGG -0.5706962 3.306297e-02
## 673 AACCCT AGCAGG 0.9018867 1.039328e-05
## 675 AGAGGC AGCAGG 0.8240837 2.895517e-04
## 677 AGG AGCAGG 0.8125791 4.123605e-04
## 681 ACC AGCAGG 0.8641457 6.730102e-05
## 682 ATCC AGCAGG 0.7542010 1.830363e-03
## 685 AAGTCTGCACACACATACT AGCAGG 0.9269506 1.871623e-06
## 687 ACACAG AGCAGG 0.7016500 5.161359e-03
## 698 AGGC AGCAGG 0.7202911 3.665806e-03
## 701 AAAAG AGCAGG -0.7187996 3.771126e-03
## 709 AAG ACAGAT 0.8741481 4.350596e-05
## 711 AGAT ACAGAT 0.8487740 1.236480e-04
## 713 AAAG ACAGAT 0.6843178 6.944081e-03
## 714 AGG ACAGAT 0.6126558 1.984267e-02
## 716 AAGGAG ACAGAT 0.8925574 1.755491e-05
## 717 AAGG ACAGAT 0.7845245 8.921394e-04
## 719 ATCC ACAGAT 0.7605322 1.588702e-03
## 720 AAAGG ACAGAT 0.8168332 3.628408e-04
## 723 ACCT ACAGAT 0.7569590 1.721745e-03
## 725 AT ACAGAT 0.9304533 1.404532e-06
## 726 AAGAG ACAGAT 0.8587379 8.402620e-05
## 729 ACATAT ACAGAT 0.7411472 2.420537e-03
## 732 AGAGG ACAGAT 0.7729729 1.187828e-03
## 734 AAAAACAAGGGAGATAT ACAGAT -0.6012655 2.294275e-02
## 738 AAAAG ACAGAT -0.5874160 2.718718e-02
## 739 AAAGAG ACAGAT 0.9172323 3.875745e-06
## 740 AGATAT ACAGAT 0.7346771 2.764035e-03
## 745 AC AAAGGG 0.5871851 2.726261e-02
## 747 AAG AAAGGG 0.6119585 2.002281e-02
## 748 AACCCT AAAGGG 0.6040670 2.214864e-02
## 749 AGAT AAAGGG 0.5410206 4.573782e-02
## 752 AGG AAAGGG 0.8404142 1.675382e-04
## 754 AAGGAG AAAGGG 0.9223098 2.680798e-06
## 755 AAGG AAAGGG 0.6496743 1.191166e-02
## 756 ACC AAAGGG 0.6598834 1.022965e-02
## 757 ATCC AAAGGG 0.7880314 8.151430e-04
## 758 AAAGG AAAGGG 0.5782123 3.031561e-02
## 761 ACCT AAAGGG 0.6242475 1.702403e-02
## 762 ACACAG AAAGGG 0.7192244 3.740889e-03
## 763 AT AAAGGG 0.6274207 1.630836e-02
## 764 AAGAG AAAGGG 0.5723129 3.245700e-02
## 765 AAGGG AAAGGG 0.5605297 3.706746e-02
## 767 ACATAT AAAGGG 0.7068125 4.706549e-03
## 770 AGAGG AAAGGG 0.6930870 5.990707e-03
## 776 AAAAG AAAGGG -0.5648877 3.530935e-02
## 777 AAAGAG AAAGGG 0.6226334 1.739708e-02
## 778 AGATAT AAAGGG 0.6402170 1.365196e-02
## 780 ACAGAT AAAGGG 0.8067144 4.894099e-04
## 785 AG AACAAG 0.6144484 1.938522e-02
## 786 AAG AACAAG 0.8438155 1.483706e-04
## 788 AGAT AACAAG 0.7266262 3.243948e-03
## 790 AAAG AACAAG 0.7047577 4.883648e-03
## 793 AAGGAG AACAAG 0.8672127 5.909245e-05
## 794 AAGG AACAAG 0.8316526 2.262956e-04
## 796 ATCC AACAAG 0.5616071 3.662695e-02
## 797 AAAGG AACAAG 0.8778710 3.663900e-05
## 800 ACCT AACAAG 0.7959779 6.602561e-04
## 802 AT AACAAG 0.7856132 8.676403e-04
## 803 AAGAG AACAAG 0.8483200 1.257619e-04
## 806 ACATAT AACAAG 0.6941714 5.880298e-03
## 809 AGAGG AACAAG 0.6391303 1.386374e-02
## 811 AAAAACAAGGGAGATAT AACAAG -0.7042463 4.928523e-03
## 816 AAAGAG AACAAG 0.8107855 4.348054e-04
## 817 AGATAT AACAAG 0.6386746 1.395328e-02
## 819 ACAGAT AACAAG 0.8664931 6.094094e-05
## 820 AAAGGG AACAAG 0.6787491 7.608062e-03
## 832 ACAGC AAAAAG 0.5827540 2.874011e-02
## 847 ACAGACAGCACAGCACAGC AAAAAG 0.5930627 2.539130e-02
## 853 ACAGACAGCACAGC AAAAAG 0.5978586 2.393712e-02
## 865 AC AGGAGGC 0.7933153 7.092612e-04
## 866 AG AGGAGGC -0.6038391 2.221245e-02
## 868 AACCCT AGGAGGC 0.8417580 1.597394e-04
## 870 AGAGGC AGGAGGC 0.7396344 2.497651e-03
## 872 AGG AGGAGGC 0.8189203 3.403647e-04
## 876 ACC AGGAGGC 0.9263603 1.961665e-06
## 877 ATCC AGGAGGC 0.8143158 3.915267e-04
## 879 ACAG AGGAGGC -0.5632206 3.597447e-02
## 880 AAGTCTGCACACACATACT AGGAGGC 0.8643840 6.663179e-05
## 882 ACACAG AGGAGGC 0.5368358 4.777543e-02
## 893 AGGC AGGAGGC 0.6661019 9.298769e-03
## 896 AAAAG AGGAGGC -0.7875134 8.261672e-04
## 899 AGCAGG AGGAGGC 0.9277953 1.748718e-06
## 907 AC AAGC 0.5468783 4.299327e-02
## 929 ACATAT AAGC 0.5886910 2.677364e-02
## 932 AGAGG AAGC 0.6586665 1.041991e-02
## 945 AAAAAG AAGC -0.6964421 5.654243e-03
## 951 AG AGAGGG 0.7524754 1.901042e-03
## 953 AACCCT AGAGGG -0.5985924 2.372026e-02
## 961 ACC AGAGGG -0.6890528 6.415719e-03
## 964 ACAG AGAGGG 0.6819058 7.225906e-03
## 965 AAGTCTGCACACACATACT AGAGGG -0.7118596 4.292847e-03
## 970 AAGGG AGAGGG 0.5353358 4.852159e-02
## 977 AAAAACAAGGGAGATAT AGAGGG -0.6405265 1.359210e-02
## 978 AGGC AGAGGG -0.7371386 2.629101e-03
## 981 AAAAG AGAGGG 0.6078034 2.112194e-02
## 984 AGCAGG AGAGGG -0.6226417 1.739514e-02
## 989 AGGAGGC AGAGGG -0.6992378 5.385288e-03
## 994 AC AGAGAGGCAGAGGC 0.6359067 1.450674e-02
## 999 AGAGGC AGAGAGGCAGAGGC 0.7073080 4.664606e-03
## 1011 ACACAG AGAGAGGCAGAGGC 0.7547257 1.809294e-03
## 1014 AAGGG AGAGAGGCAGAGGC 0.7830560 9.260614e-04
## 1015 AGAGAGGC AGAGAGGCAGAGGC 0.8991328 1.219603e-05
## 1016 ACATAT AGAGAGGCAGAGGC 0.5445738 4.405824e-02
## 1019 AGAGG AGAGAGGCAGAGGC 0.6400548 1.368341e-02
## 1024 AACC AGAGAGGCAGAGGC 0.6283393 1.610553e-02
## 1034 AAGC AGAGAGGCAGAGGC 0.6763580 7.907825e-03
## 1039 AC AAAGC -0.5354158 4.848161e-02
## 1042 AACCCT AAAGC -0.7733716 1.176466e-03
## 1046 AGG AAAGC -0.7933512 7.085817e-04
## 1050 ACC AAAGC -0.9368645 7.973279e-07
## 1051 ATCC AAAGC -0.7580672 1.679572e-03
## 1054 AAGTCTGCACACACATACT AAAGC -0.8000113 5.912269e-04
## 1067 AGGC AAAGC -0.7378471 2.591239e-03
## 1070 AAAAG AAAGC 0.8263103 2.696227e-04
## 1072 AGATAT AAAGC -0.5795735 2.983683e-02
## 1073 AGCAGG AAAGC -0.7001747 5.297430e-03
## 1075 AAAGGG AAAGC -0.5941224 2.506443e-02
## 1078 AGGAGGC AAAGC -0.8031011 5.423710e-04
## 1080 AGAGGG AAAGC 0.7590949 1.641196e-03
## 1085 AC ACAGGC 0.8826218 2.918734e-05
## 1088 AACCCT ACAGGC 0.8910874 1.898370e-05
## 1090 AGAGGC ACAGGC 0.8170589 3.603539e-04
## 1092 AGG ACAGGC 0.9314514 1.290681e-06
## 1096 ACC ACAGGC 0.9257018 2.066297e-06
## 1097 ATCC ACAGGC 0.8311918 2.297953e-04
## 1100 AAGTCTGCACACACATACT ACAGGC 0.8999953 1.160587e-05
## 1102 ACACAG ACAGGC 0.7744655 1.145743e-03
## 1113 AGGC ACAGGC 0.6088313 2.084589e-02
## 1116 AAAAG ACAGGC -0.7690186 1.305303e-03
## 1119 AGCAGG ACAGGC 0.9092562 6.613143e-06
## 1121 AAAGGG ACAGGC 0.6449852 1.275183e-02
## 1124 AGGAGGC ACAGGC 0.9127137 5.278559e-06
## 1128 AAAGC ACAGGC -0.7912819 7.486156e-04
## 1132 AC AAACAAGGGAGATAT 0.6084426 2.094998e-02
## 1133 AG AAACAAGGGAGATAT -0.8091731 4.558124e-04
## 1135 AACCCT AAACAAGGGAGATAT 0.6221313 1.751438e-02
## 1137 AGAGGC AAACAAGGGAGATAT 0.6020888 2.270720e-02
## 1138 AAAG AAACAAGGGAGATAT -0.5355607 4.840920e-02
## 1139 AGG AAACAAGGGAGATAT 0.6367221 1.434199e-02
## 1143 ACC AAACAAGGGAGATAT 0.7933097 7.093683e-04
## 1145 AAAGG AAACAAGGGAGATAT -0.6055257 2.174342e-02
## 1146 ACAG AAACAAGGGAGATAT -0.6655918 9.372561e-03
## 1147 AAGTCTGCACACACATACT AAACAAGGGAGATAT 0.8456324 1.388854e-04
## 1148 ACCT AAACAAGGGAGATAT -0.5752445 3.137931e-02
## 1149 ACACAG AAACAAGGGAGATAT 0.5656577 3.500526e-02
## 1151 AAGAG AAACAAGGGAGATAT -0.5477510 4.259498e-02
## 1159 AAAAACAAGGGAGATAT AAACAAGGGAGATAT 0.8375645 1.850990e-04
## 1160 AGGC AAACAAGGGAGATAT 0.5969336 2.421259e-02
## 1166 AGCAGG AAACAAGGGAGATAT 0.7976856 6.302822e-04
## 1171 AGGAGGC AAACAAGGGAGATAT 0.8220631 3.086498e-04
## 1173 AGAGGG AAACAAGGGAGATAT -0.7457768 2.196186e-03
## 1175 AAAGC AAACAAGGGAGATAT -0.7332943 2.842199e-03
## 1176 ACAGGC AAACAAGGGAGATAT 0.7826228 9.362629e-04
## 1210 AACC ACAGAG -0.6311454 1.549779e-02
## 1246 ACACAG ACAT 0.6103133 2.045271e-02
## 1249 AAGGG ACAT 0.6254250 1.675572e-02
## 1250 AGAGAGGC ACAT 0.8015132 5.670560e-04
## 1251 ACATAT ACAT 0.6028462 2.249210e-02
## 1271 AGAGAGGCAGAGGC ACAT 0.6745660 8.138411e-03
## 1281 AAG ACT 0.7468601 2.146144e-03
## 1283 AGAT ACT 0.7689318 1.307979e-03
## 1286 AGG ACT 0.6367597 1.433443e-02
## 1288 AAGGAG ACT 0.7841892 8.997957e-04
## 1289 AAGG ACT 0.6578432 1.055017e-02
## 1290 ACC ACT 0.5833720 2.853053e-02
## 1291 ATCC ACT 0.8015557 5.663847e-04
## 1292 AAAGG ACT 0.7058145 4.791924e-03
## 1295 ACCT ACT 0.5833842 2.852641e-02
## 1297 AT ACT 0.9178403 3.713009e-06
## 1298 AAGAG ACT 0.7436363 2.297772e-03
## 1301 ACATAT ACT 0.6075221 2.119797e-02
## 1304 AGAGG ACT 0.6992488 5.384248e-03
## 1310 AAAAG ACT -0.7020574 5.124267e-03
## 1311 AAAGAG ACT 0.8238590 2.916269e-04
## 1312 AGATAT ACT 0.6581630 1.049943e-02
## 1314 ACAGAT ACT 0.9571777 8.115607e-08
## 1315 AAAGGG ACT 0.7140482 4.122566e-03
## 1316 AACAAG ACT 0.7989392 6.089814e-04
## 1322 AAAGC ACT -0.5488524 4.209615e-02
## 1332 AAG AGAGAT 0.8505713 1.155604e-04
## 1334 AGAT AGAGAT 0.8985417 1.261457e-05
## 1336 AAAG AGAGAT 0.6793555 7.533466e-03
## 1337 AGG AGAGAT 0.5603979 3.712159e-02
## 1339 AAGGAG AGAGAT 0.7214123 3.588170e-03
## 1340 AAGG AGAGAT 0.5775631 3.054595e-02
## 1341 ACC AGAGAT 0.5742354 3.174723e-02
## 1342 ATCC AGAGAT 0.7500078 2.005853e-03
## 1343 AAAGG AGAGAT 0.6407042 1.355783e-02
## 1346 ACCT AGAGAT 0.5915087 2.587640e-02
## 1348 AT AGAGAT 0.9577740 7.470393e-08
## 1349 AAGAG AGAGAT 0.7342569 2.787603e-03
## 1352 ACATAT AGAGAT 0.7493438 2.034826e-03
## 1355 AGAGG AGAGAT 0.6866842 6.675962e-03
## 1361 AAAAG AGAGAT -0.6021450 2.269117e-02
## 1362 AAAGAG AGAGAT 0.9019679 1.034362e-05
## 1363 AGATAT AGAGAT 0.8797358 3.354841e-05
## 1365 ACAGAT AGAGAT 0.9135069 5.006033e-06
## 1366 AAAGGG AGAGAT 0.6635262 9.676048e-03
## 1367 AACAAG AGAGAT 0.7853398 8.737399e-04
## 1373 AAAGC AGAGAT -0.5718297 3.263726e-02
## 1378 ACT AGAGAT 0.8795980 3.376915e-05
## 1402 AAGGG AGGG 0.6876542 6.568411e-03
## 1403 AGAGAGGC AGGG 0.5434225 4.459741e-02
## 1410 AGGC AGGG -0.6049457 2.190387e-02
## 1423 AGAGGG AGGG 0.7898654 7.770501e-04
## 1425 AAAGC AGGG 0.5403205 4.607420e-02
## 1432 cn_major AAAAAC 0.8440494 1.471210e-04
## 1434 cn_HOR_Ymin AAAAAC 0.5728243 3.226707e-02
## 1436 AG AAAAAC -0.6460600 1.255536e-02
## 1437 AAG AAAAAC -0.6357100 1.454670e-02
## 1439 AGAT AAAAAC -0.6769117 7.837603e-03
## 1441 AAAG AAAAAC -0.7442898 2.266370e-03
## 1445 AAGG AAAAAC -0.5868516 2.737177e-02
## 1448 AAAGG AAAAAC -0.6474621 1.230257e-02
## 1450 AAGTCTGCACACACATACT AAAAAC 0.7008881 5.231285e-03
## 1451 ACCT AAAAAC -0.6108335 2.031602e-02
## 1453 AT AAAAAC -0.5355513 4.841387e-02
## 1454 AAGAG AAAAAC -0.7430779 2.324876e-03
## 1459 AGC AAAAAC 0.5606600 3.701396e-02
## 1462 AAAAACAAGGGAGATAT AAAAAC 0.7523407 1.906649e-03
## 1463 AGGC AAAAAC 0.6790759 7.567792e-03
## 1467 AAAGAG AAAAAC -0.6919680 6.106326e-03
## 1476 AGAGGG AAAAAC -0.6077218 2.114398e-02
## 1480 AAACAAGGGAGATAT AAAAAC 0.6278539 1.621246e-02
## 1490 AG AAGAGG 0.9128313 5.237424e-06
## 1491 AAG AAGAGG 0.8298571 2.401773e-04
## 1493 AGAT AAGAGG 0.6549791 1.101312e-02
## 1495 AAAG AAGAGG 0.7979150 6.263407e-04
## 1498 AAGGAG AAGAGG 0.8577825 8.730489e-05
## 1499 AAGG AAGAGG 0.9728022 5.511222e-09
## 1502 AAAGG AAGAGG 0.9607164 4.874380e-08
## 1503 ACAG AAGAGG 0.8047296 5.179592e-04
## 1505 ACCT AAGAGG 0.9617456 4.165895e-08
## 1507 AT AAGAGG 0.5910708 2.601435e-02
## 1508 AAGAG AAGAGG 0.9157132 4.308092e-06
## 1509 AAGGG AAGAGG 0.6324843 1.521404e-02
## 1511 ACATAT AAGAGG 0.6932488 5.974129e-03
## 1514 AGAGG AAGAGG 0.6880932 6.520177e-03
## 1516 AAAAACAAGGGAGATAT AAGAGG -0.8971486 1.364762e-05
## 1521 AAAGAG AAGAGG 0.7555373 1.777079e-03
## 1524 ACAGAT AAGAGG 0.7327439 2.873789e-03
## 1525 AAAGGG AAGAGG 0.6403938 1.361775e-02
## 1526 AACAAG AAGAGG 0.8141717 3.932227e-04
## 1534 AAACAAGGGAGATAT AAGAGG -0.5873821 2.719827e-02
## 1537 ACT AAGAGG 0.5698911 3.336781e-02
## 1541 cn_major AAAGGAAG -0.5516407 4.085245e-02
## 1545 AG AAAGGAAG 0.8562712 9.270294e-05
## 1546 AAG AAAGGAAG 0.9138849 4.880266e-06
## 1548 AGAT AAAGGAAG 0.8045665 5.203635e-04
## 1550 AAAG AAAGGAAG 0.9006218 1.119201e-05
## 1553 AAGGAG AAAGGAAG 0.7321563 2.907825e-03
## 1554 AAGG AAAGGAAG 0.9103519 6.163259e-06
## 1557 AAAGG AAAGGAAG 0.9760673 2.576718e-09
## 1558 ACAG AAAGGAAG 0.6524151 1.144066e-02
## 1560 ACCT AAAGGAAG 0.9206025 3.042773e-06
## 1562 AT AAAGGAAG 0.7439232 2.283945e-03
## 1563 AAGAG AAAGGAAG 0.9819305 4.833791e-10
## 1566 ACATAT AAAGGAAG 0.6583388 1.047161e-02
## 1569 AGAGG AAAGGAAG 0.6397617 1.374038e-02
## 1571 AAAAACAAGGGAGATAT AAAGGAAG -0.9541723 1.211222e-07
## 1576 AAAGAG AAAGGAAG 0.8770980 3.798652e-05
## 1579 ACAGAT AAAGGAAG 0.7813765 9.661139e-04
## 1581 AACAAG AAAGGAAG 0.8389189 1.765755e-04
## 1589 AAACAAGGGAGATAT AAAGGAAG -0.6578506 1.054900e-02
## 1592 ACT AAAGGAAG 0.6699531 8.755969e-03
## 1593 AGAGAT AAAGGAAG 0.6585840 1.043291e-02
## 1595 AAAAAC AAAGGAAG -0.7395101 2.504071e-03
## 1596 AAGAGG AAAGGAAG 0.9109297 5.936315e-06
## 1608 ACAGC AAAAATCTTAAAGG -0.6256377 1.670762e-02
## 1609 AAGGAG AAAAATCTTAAAGG -0.5603924 3.712387e-02
## 1623 ACAGACAGCACAGCACAGC AAAAATCTTAAAGG -0.6290862 1.594203e-02
## 1629 ACAGACAGCACAGC AAAAATCTTAAAGG -0.6154109 1.914291e-02
## 1633 AGATAT AAAAATCTTAAAGG -0.5666919 3.459988e-02
## 1635 ACAGAT AAAAATCTTAAAGG -0.6205377 1.789067e-02
## 1636 AAAGGG AAAAATCTTAAAGG -0.6170336 1.873962e-02
## 1637 AACAAG AAAAATCTTAAAGG -0.5636608 3.579795e-02
## 1643 AAAGC AAAAATCTTAAAGG 0.5909849 2.604145e-02
## 1648 ACT AAAAATCTTAAAGG -0.5772176 3.066908e-02
## 1649 AGAGAT AAAAATCTTAAAGG -0.5947339 2.487727e-02
## 1657 AC ACTGCT 0.6427693 1.316431e-02
## 1660 AACCCT ACTGCT 0.7191759 3.744335e-03
## 1662 AGAGGC ACTGCT 0.5907612 2.611221e-02
## 1664 AGG ACTGCT 0.6649561 9.465163e-03
## 1668 ACC ACTGCT 0.7696240 1.286740e-03
## 1669 ATCC ACTGCT 0.7640271 1.466608e-03
## 1672 AAGTCTGCACACACATACT ACTGCT 0.7096391 4.471186e-03
## 1685 AGGC ACTGCT 0.6611117 1.004032e-02
## 1688 AAAAG ACTGCT -0.7672257 1.361526e-03
## 1691 AGCAGG ACTGCT 0.7913762 7.467516e-04
## 1696 AGGAGGC ACTGCT 0.8736008 4.459848e-05
## 1700 AAAGC ACTGCT -0.6921888 6.083380e-03
## 1701 ACAGGC ACTGCT 0.7267566 3.235687e-03
## 1702 AAACAAGGGAGATAT ACTGCT 0.5855584 2.779824e-02
## 1705 ACT ACTGCT 0.5613406 3.673551e-02
## 1715 AC ACACAT 0.6348328 1.472592e-02
## 1718 AACCCT ACACAT 0.8134960 4.012531e-04
## 1722 AGG ACACAT 0.7911380 7.514649e-04
## 1726 ACC ACACAT 0.8901559 1.993727e-05
## 1727 ATCC ACACAT 0.8666711 6.047954e-05
## 1730 AAGTCTGCACACACATACT ACACAT 0.6926040 6.040406e-03
## 1733 AT ACACAT 0.6657221 9.353679e-03
## 1743 AGGC ACACAT 0.6208681 1.781215e-02
## 1746 AAAAG ACACAT -0.8640085 6.768878e-05
## 1748 AGATAT ACACAT 0.5629358 3.608900e-02
## 1749 AGCAGG ACACAT 0.7263320 3.262647e-03
## 1750 ACAGAT ACACAT 0.6281278 1.615205e-02
## 1751 AAAGGG ACACAT 0.5785985 3.017920e-02
## 1754 AGGAGGC ACACAT 0.8273850 2.604092e-04
## 1756 AGAGGG ACACAT -0.6295187 1.584793e-02
## 1758 AAAGC ACACAT -0.8583684 8.528209e-05
## 1759 ACAGGC ACACAT 0.7511447 1.957010e-03
## 1760 AAACAAGGGAGATAT ACACAT 0.5395026 4.646948e-02
## 1763 ACT ACACAT 0.7530063 1.879073e-03
## 1764 AGAGAT ACACAT 0.6650002 9.458719e-03
## 1770 ACTGCT ACACAT 0.7971878 6.389041e-04
## 1782 ACAGC AAACC -0.8505582 1.156178e-04
## 1797 ACAGACAGCACAGCACAGC AAACC -0.8788170 3.504344e-05
## 1803 ACAGACAGCACAGC AAACC -0.8832293 2.833114e-05
## 1812 AAAAAG AAACC -0.6172248 1.869251e-02
## 1829 ACTGCT AAACC 0.5559035 3.900351e-02
## 1835 AG AAAACC -0.7745204 1.144220e-03
## 1840 AAAG AAAACC -0.6039374 2.218490e-02
## 1844 AAGG AAAACC -0.6064134 2.149961e-02
## 1845 ACC AAAACC 0.6890482 6.416214e-03
## 1847 AAAGG AAAACC -0.6588306 1.039409e-02
## 1848 ACAG AAAACC -0.6303054 1.567784e-02
## 1849 AAGTCTGCACACACATACT AAAACC 0.7560369 1.757473e-03
## 1850 ACCT AAAACC -0.5950317 2.478647e-02
## 1853 AAGAG AAAACC -0.5976837 2.398901e-02
## 1861 AAAAACAAGGGAGATAT AAAACC 0.8380955 1.817188e-04
## 1868 AGCAGG AAAACC 0.6296777 1.581343e-02
## 1873 AGGAGGC AAAACC 0.6758703 7.970071e-03
## 1875 AGAGGG AAAACC -0.7065394 4.729791e-03
## 1877 AAAGC AAAACC -0.6845233 6.920471e-03
## 1878 ACAGGC AAAACC 0.6854976 6.809375e-03
## 1879 AAACAAGGGAGATAT AAAACC 0.9464340 3.037009e-07
## 1885 AAAAAC AAAACC 0.6440681 1.292133e-02
## 1886 AAGAGG AAAACC -0.6212649 1.771821e-02
## 1887 AAAGGAAG AAAACC -0.6953682 5.760291e-03
## 1892 cn_major AAAAACC 0.6687758 8.919233e-03
## 1896 AG AAAAACC -0.7098716 4.452249e-03
## 1897 AAG AAAAACC -0.7919462 7.355705e-04
## 1899 AGAT AAAAACC -0.7275218 3.187534e-03
## 1901 AAAG AAAAACC -0.8412995 1.623661e-04
## 1905 AAGG AAAAACC -0.6938952 5.908266e-03
## 1908 AAAGG AAAAACC -0.8093634 4.532910e-04
## 1909 ACAG AAAAACC -0.5925099 2.556307e-02
## 1910 AAGTCTGCACACACATACT AAAAACC 0.5877763 2.706986e-02
## 1911 ACCT AAAAACC -0.7471097 2.134747e-03
## 1913 AT AAAAACC -0.6117614 2.007395e-02
## 1914 AAGAG AAAAACC -0.8338486 2.102076e-04
## 1922 AAAAACAAGGGAGATAT AAAAACC 0.8896429 2.047902e-05
## 1927 AAAGAG AAAAACC -0.7590166 1.644097e-03
## 1930 ACAGAT AAAAACC -0.5725235 3.237870e-02
## 1932 AACAAG AAAAACC -0.5750942 3.143389e-02
## 1933 AAAAAG AAAAACC 0.5590630 3.767342e-02
## 1940 AAACAAGGGAGATAT AAAAACC 0.7352132 2.734193e-03
## 1946 AAAAAC AAAAACC 0.8175790 3.546762e-04
## 1947 AAGAGG AAAAACC -0.6889266 6.429382e-03
## 1948 AAAGGAAG AAAAACC -0.8791965 3.441943e-05
## 1953 AAAACC AAAAACC 0.7857744 8.640576e-04
## 1958 AG ACACATAT 0.6297079 1.580690e-02
## 1959 AAG ACACATAT 0.9565039 8.899533e-08
## 1961 AGAT ACACATAT 0.9389898 6.522895e-07
## 1963 AAAG ACACATAT 0.8898077 2.030374e-05
## 1966 AAGGAG ACACATAT 0.7701492 1.270810e-03
## 1967 AAGG ACACATAT 0.8668021 6.014169e-05
## 1969 ATCC ACACATAT 0.5536394 3.997771e-02
## 1970 AAAGG ACACATAT 0.8937335 1.647629e-05
## 1973 ACCT ACACATAT 0.8246451 2.844179e-04
## 1975 AT ACACATAT 0.8982272 1.284200e-05
## 1976 AAGAG ACACATAT 0.9307724 1.367266e-06
## 1979 ACATAT ACACATAT 0.8223294 3.060758e-04
## 1982 AGAGG ACACATAT 0.8498136 1.189161e-04
## 1984 AAAAACAAGGGAGATAT ACACATAT -0.7712784 1.237089e-03
## 1989 AAAGAG ACACATAT 0.9341650 1.018970e-06
## 1990 AGATAT ACACATAT 0.6348444 1.472354e-02
## 1992 ACAGAT ACACATAT 0.8603236 7.880632e-05
## 1993 AAAGGG ACACATAT 0.5583915 3.795327e-02
## 1994 AACAAG ACACATAT 0.8228387 3.012022e-04
## 1995 AAAAAG ACACATAT -0.5606745 3.700800e-02
## 1997 AAGC ACACATAT 0.5327225 4.984181e-02
## 2005 ACT ACACATAT 0.7731975 1.181418e-03
## 2006 AGAGAT ACACATAT 0.8268936 2.645899e-04
## 2008 AAAAAC ACACATAT -0.6666895 9.214307e-03
## 2009 AAGAGG ACACATAT 0.8068933 4.868995e-04
## 2010 AAAGGAAG ACACATAT 0.9039513 9.190343e-06
## 2016 AAAAACC ACACATAT -0.8069037 4.867536e-04
## 2020 AC AGCAGGAGGAGG 0.6411136 1.347910e-02
## 2023 AACCCT AGCAGGAGGAGG 0.5498888 4.163065e-02
## 2025 AGAGGC AGCAGGAGGAGG 0.6740693 8.203241e-03
## 2027 AGG AGCAGGAGGAGG 0.6112526 2.020644e-02
## 2035 AAGTCTGCACACACATACT AGCAGGAGGAGG 0.6263436 1.654868e-02
## 2037 ACACAG AGCAGGAGGAGG 0.8404333 1.674247e-04
## 2041 AGAGAGGC AGCAGGAGGAGG 0.6273619 1.632142e-02
## 2054 AGCAGG AGCAGGAGGAGG 0.5843175 2.821209e-02
## 2064 ACAGGC AGCAGGAGGAGG 0.6872426 6.613884e-03
## 2065 AAACAAGGGAGATAT AGCAGGAGGAGG 0.5464055 4.321024e-02
## 2079 AAAAACC AGCAGGAGGAGG 0.5412812 4.561309e-02
## 2086 AAG ACATAG 0.8623949 7.238826e-05
## 2088 AGAT ACATAG 0.8279815 2.554058e-04
## 2090 AAAG ACATAG 0.6902402 6.288259e-03
## 2091 AGG ACATAG 0.6091706 2.075538e-02
## 2093 AAGGAG ACATAG 0.9352899 9.211397e-07
## 2094 AAGG ACATAG 0.8005357 5.826957e-04
## 2096 ATCC ACATAG 0.7216642 3.570910e-03
## 2097 AAAGG ACATAG 0.7968310 6.451415e-04
## 2100 ACCT ACATAG 0.7638464 1.472733e-03
## 2102 AT ACATAG 0.8893262 2.081936e-05
## 2103 AAGAG ACATAG 0.8235778 2.942407e-04
## 2106 ACATAT ACATAG 0.8778584 3.666065e-05
## 2109 AGAGG ACATAG 0.7925484 7.239044e-04
## 2111 AAAAACAAGGGAGATAT ACATAG -0.5698117 3.339801e-02
## 2115 AAAAG ACATAG -0.5438580 4.439291e-02
## 2116 AAAGAG ACATAG 0.8587177 8.409459e-05
## 2117 AGATAT ACATAG 0.7956712 6.657581e-04
## 2119 ACAGAT ACATAG 0.9289660 1.589445e-06
## 2120 AAAGGG ACATAG 0.8508024 1.145527e-04
## 2121 AACAAG ACATAG 0.8504493 1.160956e-04
## 2132 ACT ACATAG 0.8424517 1.558300e-04
## 2133 AGAGAT ACATAG 0.8525818 1.070280e-04
## 2136 AAGAGG ACATAG 0.7781731 1.046341e-03
## 2137 AAAGGAAG ACATAG 0.7576165 1.696622e-03
## 2140 ACACAT ACATAG 0.6048381 2.193375e-02
## 2144 ACACATAT ACATAG 0.8480556 1.270067e-04
# very high sig: AAG ACACATAT 0.9565039 8.899533e-08
#ACAGC ACAGACAGCACAGCACAGC 0.9932645 1.328715e-12
# AAAGG AAAGGAAG 0.9760673 2.576718e-09
#AGAT AAAGAG 0.9664049 1.930525e-08
# ACAGACAGCACAGCACAGC ACAGACAGCACAGC 0.9981677 4.440892e-16
# ACAGC ACAGACAGCACAGC 0.9944554 4.145573e-13
# AAAGG AAGAG 0.9757621 2.778423e-09
# AAG AAAG 0.9429756 4.387182e-07
# complex_repeats:
# cn_major cn_minor 0.596193948 0.02443458
# cn_minor AGC 0.6080686725 0.02105044
# cn_minor AGGC 0.5649979280 0.03526571
# cn_major AAAAACC 0.66877575 0.008919233
# cn_major AAAGGAAG -0.55164066 0.040852452
# cn_major AAAAAC 0.84404938 0.000147121
# cn_major AGGC 0.54698523 0.042944336
# cn_major AAAAACAAGGGAGATAT 0.54077219 0.045856990
# cn_HOR_Ymin AACCCT 0.59243629 0.02558601
# cn_HOR_Ymin AAGGAG 0.54339639 0.04460971
# cn_HOR_Ymin ACACAG 0.56250836 0.03626140
# cn_HOR_Ymin AGC 0.54872625 0.04215305
# cn_HOR_Ymin AGGC 0.64042345 0.01361201
# cn_HOR_Ymin AAAAAC 0.57282428 0.03226707
# cn_major cn_HOR_Ymin 0.57280260 0.032275105
# cn_minor cn_HOR_Ymin 0.7871866529 0.0008331837
res2.2[which(res2.2$row == "cn_minor"),]
## row column cor p
## 3 cn_minor cn_HOR_Ymin 0.7871866529 0.0008331837
## 5 cn_minor AC 0.4020967811 0.1540876291
## 8 cn_minor AG 0.0024032620 0.9934943573
## 12 cn_minor AAG 0.0288507825 0.9220082684
## 17 cn_minor AACCCT 0.4394568064 0.1158949115
## 23 cn_minor AGAT -0.1118349440 0.7034760976
## 30 cn_minor AGAGGC 0.2716551453 0.3474689709
## 38 cn_minor AAAG -0.0701922507 0.8115384962
## 47 cn_minor AGG 0.1795982604 0.5389698138
## 57 cn_minor ACAGC 0.0772760679 0.7928783960
## 68 cn_minor AAGGAG 0.2923860323 0.3103906534
## 80 cn_minor AAGG 0.1174742496 0.6891878515
## 93 cn_minor ACC 0.1795772716 0.5390180412
## 107 cn_minor ATCC 0.1400616173 0.6329575704
## 122 cn_minor AAAGG 0.0587759051 0.8418040937
## 138 cn_minor ACAG 0.3336816629 0.2436471735
## 155 cn_minor AAGTCTGCACACACATACT 0.4065565841 0.1491423268
## 173 cn_minor ACCT 0.0972248304 0.7409090515
## 192 cn_minor ACACAG 0.4963592495 0.0710222977
## 212 cn_minor AT -0.0755744038 0.7973518835
## 233 cn_minor AAGAG -0.0494216306 0.8667571276
## 255 cn_minor AAGGG 0.3050441182 0.2889174740
## 278 cn_minor AGAGAGGC 0.3803112059 0.1797928011
## 302 cn_minor ACATAT 0.3498796788 0.2200848007
## 327 cn_minor ACAGACAGCACAGCACAGC 0.0522718809 0.8591406626
## 353 cn_minor AGC 0.6080686725 0.0210504418
## 380 cn_minor AGAGG 0.1666156976 0.5691529772
## 408 cn_minor AAAGACAGACAG -0.0368941898 0.9003524826
## 437 cn_minor AAAAACAAGGGAGATAT 0.0251312206 0.9320405579
## 467 cn_minor AGGC 0.5649979280 0.0352657133
## 498 cn_minor ACAGACAGCACAGC 0.0603568300 0.8375998717
## 530 cn_minor AACC -0.1560353328 0.5942553371
## 563 cn_minor AAAAG -0.3056937031 0.2878396036
## 597 cn_minor AAAGAG -0.1234548405 0.6741395001
## 632 cn_minor AGATAT -0.1061813248 0.7178924999
## 668 cn_minor AGCAGG 0.3087901482 0.2827340345
## 705 cn_minor ACAGAT -0.0176544987 0.9522335372
## 743 cn_minor AAAGGG 0.2527080346 0.3833912880
## 782 cn_minor AACAAG 0.0912374292 0.7564099844
## 822 cn_minor AAAAAG -0.0687614602 0.8153191057
## 863 cn_minor AGGAGGC 0.0627222736 0.8313168920
## 905 cn_minor AAGC 0.4127161514 0.1424868068
## 948 cn_minor AGAGGG -0.0892099415 0.7616786664
## 992 cn_minor AGAGAGGCAGAGGC 0.4758936279 0.0854177320
## 1037 cn_minor AAAGC -0.1754958763 0.5484316170
## 1083 cn_minor ACAGGC 0.2110608654 0.4688744398
## 1130 cn_minor AAACAAGGGAGATAT 0.0647303468 0.8259904684
## 1178 cn_minor ACAGAG -0.0629743315 0.8306479331
## 1227 cn_minor ACAT 0.3923292393 0.1652928707
## 1277 cn_minor ACT -0.0990550271 0.7361886962
## 1328 cn_minor AGAGAT -0.0816660355 0.7813652633
## 1380 cn_minor AGGG -0.0718793881 0.8070854414
## 1433 cn_minor AAAAAC 0.4943928231 0.0723247562
## 1487 cn_minor AAGAGG 0.1861519545 0.5240036571
## 1542 cn_minor AAAGGAAG -0.0398925747 0.8922954371
## 1598 cn_minor AAAAATCTTAAAGG 0.1589578180 0.5872778141
## 1655 cn_minor ACTGCT 0.0354668725 0.9041910490
## 1713 cn_minor ACACAT 0.0744723396 0.8002521861
## 1772 cn_minor AAACC -0.1186387581 0.6862491310
## 1832 cn_minor AAAACC -0.0002476959 0.9993294795
## 1893 cn_minor AAAAACC 0.0650178278 0.8252284874
## 1955 cn_minor ACACATAT 0.0415966961 0.8877204976
## 2018 cn_minor AGCAGGAGGAGG 0.2382297860 0.4121149488
## 2082 cn_minor ACATAG 0.1873903226 0.5211966624
res2.2[which(res2.2$row == "AAAAACAAGGGAGATAT" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] # 8
## row column cor p
## 626 AAAAACAAGGGAGATAT AAAGAG -0.7285579 3.123237e-03
## 734 AAAAACAAGGGAGATAT ACAGAT -0.6012655 2.294275e-02
## 811 AAAAACAAGGGAGATAT AACAAG -0.7042463 4.928523e-03
## 977 AAAAACAAGGGAGATAT AGAGGG -0.6405265 1.359210e-02
## 1516 AAAAACAAGGGAGATAT AAGAGG -0.8971486 1.364762e-05
## 1571 AAAAACAAGGGAGATAT AAAGGAAG -0.9541723 1.211222e-07
## 1984 AAAAACAAGGGAGATAT ACACATAT -0.7712784 1.237089e-03
## 2111 AAAAACAAGGGAGATAT ACATAG -0.5698117 3.339801e-02
res2.2[which(res2.2$column == "AAAAACAAGGGAGATAT" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] #10
## row column cor p
## 440 AG AAAAACAAGGGAGATAT -0.9375549 7.475579e-07
## 441 AAG AAAAACAAGGGAGATAT -0.7855231 8.696458e-04
## 443 AGAT AAAAACAAGGGAGATAT -0.6308160 1.556820e-02
## 445 AAAG AAAAACAAGGGAGATAT -0.8331100 2.155119e-04
## 448 AAGGAG AAAAACAAGGGAGATAT -0.6015439 2.286288e-02
## 449 AAGG AAAAACAAGGGAGATAT -0.8679448 5.725925e-05
## 452 AAAGG AAAAACAAGGGAGATAT -0.9343189 1.005095e-06
## 453 ACAG AAAAACAAGGGAGATAT -0.7646264 1.446444e-03
## 455 ACCT AAAAACAAGGGAGATAT -0.8996703 1.182541e-05
## 458 AAGAG AAAAACAAGGGAGATAT -0.9112247 5.823148e-06
res2.2[which(res2.2$row == "AAAAG" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] # 10
## row column cor p
## 701 AAAAG AGCAGG -0.7187996 3.771126e-03
## 738 AAAAG ACAGAT -0.5874160 2.718718e-02
## 776 AAAAG AAAGGG -0.5648877 3.530935e-02
## 896 AAAAG AGGAGGC -0.7875134 8.261672e-04
## 1116 AAAAG ACAGGC -0.7690186 1.305303e-03
## 1310 AAAAG ACT -0.7020574 5.124267e-03
## 1361 AAAAG AGAGAT -0.6021450 2.269117e-02
## 1688 AAAAG ACTGCT -0.7672257 1.361526e-03
## 1746 AAAAG ACACAT -0.8640085 6.768878e-05
## 2115 AAAAG ACATAG -0.5438580 4.439291e-02
res2.2[which(res2.2$column == "AAAAG" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] # 8
## row column cor p
## 565 AC AAAAG -0.6445830 0.0128259548
## 568 AACCCT AAAAG -0.8319098 0.0002243619
## 572 AGG AAAAG -0.7348410 0.0027548828
## 576 ACC AAAAG -0.8445019 0.0001447274
## 577 ATCC AAAAG -0.8182549 0.0003474049
## 580 AAGTCTGCACACACATACT AAAAG -0.7349647 0.0027479929
## 583 AT AAAAG -0.5905381 0.0261828839
## 593 AGGC AAAAG -0.6866443 0.0066804094
res2.2[which(res2.2$row == "AAAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] #1
## row column cor p
## 2016 AAAAACC ACACATAT -0.8069037 0.0004867536
nrow(res2.2[which(res2.2$column == "AAAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),]) # 15
## [1] 15
res2.2[which(res2.2$row == "AAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),] #0
## [1] row column cor p
## <0 rows> (or 0-length row.names)
nrow(res2.2[which(res2.2$column == "AAAACC" & res2.2$cor < (-0.5) & res2.2$p < 0.05),]) # 11
## [1] 11